Example #1
def create_tasks_fasta2DB(split_subreadsets_fofn_pfn, parameters):
    tasks = list()
    next_inputs = dict()
    topdir = os.path.join(os.path.dirname(fn(split_subreadsets_fofn_pfn)), 'run-fastas2fofn')
    # Create the fastas in parallel.
    for i, chunk_fn in enumerate(open(fn(split_subreadsets_fofn_pfn)).read().splitlines()):
        wdir = os.path.join(topdir, 'fasta_job_{:03d}'.format(i)) # TODO: 02
        chunk_pfn = makePypeLocalFile(os.path.join(wdir, chunk_fn))
        fasta_done_fn = os.path.join(wdir, 'chunk_{:03d}_done'.format(i)) # TODO: 02
        # By depending on a sentinel, we are allowed to delete fastas later.
        # Note: i might not match num in chunk_fn, but that is ok
        fasta_done_pfn = makePypeLocalFile(fasta_done_fn)
        make_task = PypeTask(
                inputs = {"dataset": chunk_pfn, },
                outputs =  {"fasta_done": fasta_done_pfn, },
                parameters = parameters,
        )
        task = make_task(start_task.task_bam2fasta_dexta)
        tasks.append(task)
        next_inputs['fasta_{}_done'.format(i)] = fasta_done_pfn
        #fasta_fn = base_from_done(fasta_done_fn) + '.fasta'  # By convention.
    # Create the FOFN of fastas.
    fasta_fofn_fn = os.path.join(topdir, 'fasta.fofn')
    fasta_fofn_pfn = makePypeLocalFile(fasta_fofn_fn)
    make_task = PypeTask(
            inputs = next_inputs,
            outputs =  {"fofn": fasta_fofn_pfn,
            },
            parameters = parameters,
    )
    task = make_task(start_task.task_fastas2fofn)
    tasks.append(task)
    return tasks, fasta_fofn_pfn
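
The helper above fans out one bam2fasta task per chunk and fans back in through a single FOFN-gathering task, using the chunk_*_done sentinels as the dependency edges. A minimal sketch of how its return values are typically consumed, modeled on the call site in Example #29; the import path and max_jobs value are assumptions, not part of the original example.

# Sketch only, mirroring the call site in Example #29.
from pypeflow.simple_pwatcher_bridge import PypeProcWatcherWorkflow, makePypeLocalFile  # assumed module path

wf = PypeProcWatcherWorkflow(max_jobs=8)  # hypothetical concurrency
split_subreadsets_fofn_pfn = makePypeLocalFile('run-bam_scatter/chunked_subreadsets.fofn')
parameters = {}  # per-task parameters, passed through as in the examples above
tasks, input_fofn_pfn = create_tasks_fasta2DB(split_subreadsets_fofn_pfn, parameters)
wf.addTasks(tasks)    # the per-chunk bam2fasta tasks plus the fasta.fofn gather task
wf.refreshTargets()   # blocks until fasta.fofn has been written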
Example #2
def create_quiver_jobs(scattered_quiver_plf):
    scattered_quiver_fn = fn(scattered_quiver_plf)
    jobs = json.loads(open(scattered_quiver_fn).read())
    #ctg_ids = sorted(jobs['ref_seq_data'])
    p_ctg_out = []
    h_ctg_out = []
    job_done_plfs = {}
    for job in jobs:
        ctg_id = job['ctg_id']
        m_ctg_id = ctg_id.split('-')[0]
        wd = os.path.join(os.getcwd(), './4-quiver/', m_ctg_id)
        ref_fasta = makePypeLocalFile(
            os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id=ctg_id)))
        read_sam = makePypeLocalFile(
            os.path.join(
                os.getcwd(), './4-quiver/reads/'
                '{ctg_id}.sam'.format(ctg_id=ctg_id)))
        cns_fasta = makePypeLocalFile(
            os.path.join(wd, 'cns-{ctg_id}.fasta.gz'.format(ctg_id=ctg_id)))
        cns_fastq = makePypeLocalFile(
            os.path.join(wd, 'cns-{ctg_id}.fastq.gz'.format(ctg_id=ctg_id)))
        job_done = makePypeLocalFile(
            os.path.join(wd, '{ctg_id}_quiver_done'.format(ctg_id=ctg_id)))

        # TODO(CD): Ask Jason what we should do if missing SAM.
        if os.path.exists(fn(read_sam)):
            if ctg_types[ctg_id] == 'p':
                p_ctg_out.append((fn(cns_fasta), fn(cns_fastq)))
            elif ctg_types[ctg_id] == 'h':
                h_ctg_out.append((fn(cns_fasta), fn(cns_fastq)))
            else:
                LOG.warning(
                    'Type is {!r}, not "p" or "h". Why are we running Quiver?'.
                    format(ctg_types[ctg_id]))
            parameters = {
                'job_uid': 'q-' + ctg_id,
                'wd': wd,
                'config': config,
                'ctg_id': ctg_id
            }
            make_quiver_task = PypeTask(
                inputs={
                    'ref_fasta': ref_fasta,
                    'read_sam': read_sam,
                    'scattered_quiver': scattered_quiver_plf,
                },
                outputs={
                    'cns_fasta': cns_fasta,
                    'cns_fastq': cns_fastq,
                    'job_done': job_done
                },
                parameters=parameters,
            )
            quiver_task = make_quiver_task(task_run_quiver)
            wf.addTask(quiver_task)
            job_done_plfs['{}'.format(ctg_id)] = job_done
    #sge_quiver = config['sge_quiver']
    return p_ctg_out, h_ctg_out, job_done_plfs
Example #3
def create_merge_gather_task(wd, inputs):
    las_fofn_plf = makePypeLocalFile(os.path.join(wd, 'las.fofn'))
    las_fopfn_plf = makePypeLocalFile(os.path.join(wd, 'las.fopfn'))

    make_task = PypeTask(inputs=inputs,  # p_ids_merged_las
                         outputs={'las_fofn': las_fofn_plf,
                                  'las_fopfn': las_fopfn_plf,
                                  },
                         )
    task = make_task(pype_tasks.task_merge_gather)
    return task, las_fofn_plf, las_fopfn_plf
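
A usage sketch grounded in the call site in Example #28 (pread_dir, p_ids_merged_las, and wf are defined there; nothing beyond that is assumed):

# How the merge-gather task is wired in during stage-1 (see Example #28);
# p_ids_merged_las maps job ids to the merged .las PypeLocalFiles from Example #8.
task, las_fofn_plf, las_fopfn_plf = create_merge_gather_task(
        os.path.join(pread_dir, 'merge-gather'), p_ids_merged_las)
wf.addTask(task)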
Example #4
def create_tasks_pbalign(chunk_json_pfn, referenceset_pfn, parameters):
    """Create a pbalign task for each chunk, plus a gathering task.
    """
    tasks = list()
    gathering = dict()
    chunk_dir = os.path.dirname(fn(chunk_json_pfn))
    for i, subreadset_fn in enumerate(
            sorted(
                yield_pipeline_chunk_names_from_json(open(fn(chunk_json_pfn)),
                                                     '$chunk.subreadset_id'))):
        wdir = 'run-pbalign-{:02d}'.format(i)
        subreadset_fn = os.path.join(chunk_dir,
                                     os.path.basename(subreadset_fn))
        subreadset_pfn = makePypeLocalFile(subreadset_fn)
        unmapped_pfn = makePypeLocalFile(
            '{wdir}/unmapped.txt'.format(**locals()))
        alignmentset_pfn = makePypeLocalFile(
            '{wdir}/align.subreads.{i:02d}.alignmentset.xml'.format(
                **locals()))
        gathering['unmapped_{:02d}'.format(i)] = unmapped_pfn
        gathering['alignmentsets_{:02d}'.format(i)] = alignmentset_pfn
        """Also produces:
        aligned.subreads.i.alignmentset.bam
        aligned.subreads.i.alignmentset.bam.bai
        aligned.subreads.i.alignmentset.bam.pbi
        """
        make_task = PypeTask(
            inputs={
                "chunk_json": chunk_json_pfn,
                "dataset": subreadset_pfn,
                "referenceset": referenceset_pfn,
            },
            outputs={
                "alignmentset": alignmentset_pfn,
                "unmapped": unmapped_pfn,
            },
            parameters=parameters,
        )
        task = make_task(start_task.task_pbalign)
        tasks.append(task)
    o_alignmentset_pfn = makePypeLocalFile(
        'run-pbalign_gather/aligned.subreads.alignmentset.xml')
    o_unmapped_pfn = makePypeLocalFile('run-pbalign_gather/unmapped.txt')
    make_task = PypeTask(
        inputs=gathering,
        outputs={
            "o_ds": o_alignmentset_pfn,
            "o_unmapped": o_unmapped_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_pbalign_gather)
    tasks.append(task)
    return tasks, alignmentset_pfn
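
As with the other scatter/gather helpers here, the per-chunk pbalign tasks and the gathering task come back together; Example #29 shows the call site. A minimal consumption sketch (wf, the chunk JSON pfn, the referenceset pfn, and parameters are assumed to exist as in that example):

# Sketch mirroring Example #29; pbalign_chunk_json_pfn and referenceset_pfn are
# produced by the pbalign-scatter and fasta2referenceset tasks defined there.
tasks, alignmentset_pfn = create_tasks_pbalign(
        pbalign_chunk_json_pfn, referenceset_pfn, parameters)
wf.addTasks(tasks)
wf.refreshTargets()   # runs the per-chunk pbalign jobs, then the gather task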
Example #5
def create_merge_gather_task(wd, inputs):
    las_fofn_plf = makePypeLocalFile(os.path.join(wd, 'las.fofn'))
    las_fopfn_plf = makePypeLocalFile(os.path.join(wd, 'las.fopfn'))

    make_task = PypeTask(inputs = inputs, # p_ids_merged_las
                         outputs =  {'las_fofn': las_fofn_plf,
                                     'las_fopfn': las_fopfn_plf,
                         },
                         TaskType = MyFakePypeThreadTaskBase,
    )
    #                     URL = 'task://localhost/pmerge_gather')
    task = make_task(pype_tasks.task_merge_gather)
    return task, las_fofn_plf, las_fopfn_plf
Example #6
def create_merge_gather_task(wd, inputs):
    las_fofn_plf = makePypeLocalFile(os.path.join(wd, 'las.fofn'))
    las_fopfn_plf = makePypeLocalFile(os.path.join(wd, 'las.fopfn'))

    make_task = PypeTask(
        inputs=inputs,  # p_ids_merged_las
        outputs={
            'las_fofn': las_fofn_plf,
            'las_fopfn': las_fopfn_plf,
        },
    )
    task = make_task(pype_tasks.task_merge_gather)
    return task, las_fofn_plf, las_fopfn_plf
Example #7
def main1(prog_name, input_config_fn, logger_config_fn=None):
    global LOG
    LOG = support.setup_logger(logger_config_fn)
    lfs_setstripe_maybe(path='.', stripe=12)

    LOG.info('fc_run started with configuration %s', input_config_fn)
    try:
        config = support.parse_cfg_file(input_config_fn)
        import json
        dumped = json.dumps(config, indent=2, separators=(',', ': '), sort_keys=True)
        LOG.info('cfg=\n{}'.format(dumped))
    except Exception:
        LOG.exception('Failed to parse config "{}".'.format(input_config_fn))
        raise
    general_config = config['General']
    assert 'input_fofn' in general_config, 'Missing "input_fofn" in {}.'.format(input_config_fn)
    input_fofn_plf = makePypeLocalFile(general_config['input_fofn'])
    genome_size = int(general_config.get('genome_size', '0'))
    squash = 0 < genome_size < 1000000
    wf = PypeProcWatcherWorkflow(job_defaults=config['job.defaults'],
                                 squash=squash,
    )
    general_config['ver'] = '100'
    config_fn = './config.json' # must not be in a task-dir
    io.serialize(config_fn, config)
    with open('foo.snake', 'w') as snakemake_writer:
        rule_writer = snakemake.SnakemakeRuleWriter(snakemake_writer)
        run(wf, config, rule_writer,
            os.path.abspath(config_fn),
            input_fofn_plf=input_fofn_plf,
            )
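
This newer main1 variant reads a sectioned config (compare Examples #10/#12, which use the flat get_dict_from_old_falcon_cfg form). A sketch of just the keys it touches directly; the values and the job.defaults contents are hypothetical:

# Hypothetical minimal shape of the parsed cfg consumed by main1 above.
config = {
    'General': {
        'input_fofn': 'input.fofn',
        'genome_size': '4652500',   # squash is enabled only when 0 < genome_size < 1000000
    },
    'job.defaults': {
        'job_type': 'local',        # hypothetical scheduler defaults
    },
}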
Example #8
def create_merge_tasks(basedir, scatter_fn):
    tasks = []
    p_ids_merged_las = {}  # for consensus
    content = json.loads(open(scatter_fn).read())  # array of descriptions
    for section in content:
        parameters = section['parameters']
        aligner = parameters['config']['aligner']
        inputs = section['inputs']
        inputs['scatter_fn'] = scatter_fn
        outputs = section['outputs']
        URL = section['URL']
        p_id = parameters['job_id']
        #merge_script = parameters['merge_script']
        #sge_option = parameters['sge_option']
        wdir = os.path.join(basedir, 'm_%05d' % p_id)
        if aligner == 'minialign':
            for input_fn, input_fpath in inputs.items():
                inputs[input_fn] = makePypeLocalFile(input_fpath)
        make_task = PypeTask(
            inputs=inputs,
            outputs=outputs,
            parameters=parameters,
            wdir=wdir,
        )
        task = make_task(pype_tasks.task_run_las_merge if aligner ==
                         'daligner' else pype_tasks.task_run_aligner)
        tasks.append(task)
        ovl_file = task.outputs[
            'merged_las' if aligner == 'daligner' else
            'ovl_fn']  # these are relative, so we need the PypeLocalFiles
        p_ids_merged_las[p_id] = ovl_file
    return tasks, p_ids_merged_las
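
create_merge_tasks (and create_consensus_tasks in Example #9) drive themselves entirely from a scattered JSON file written by an upstream scatter task. A sketch of one section as this function reads it; the paths and values are hypothetical, while the keys are the ones accessed above:

# One entry of the scattered JSON consumed by create_merge_tasks; file names are made up.
section = {
    'parameters': {
        'job_id': 7,
        'config': {'aligner': 'daligner'},   # 'minialign' selects task_run_aligner instead
    },
    'inputs': {'gathered_las': '0-rawreads/raw-gather/gathered_las.txt'},  # hypothetical
    'outputs': {'merged_las': 'merged.las'},   # the minialign branch uses 'ovl_fn' instead
    'URL': 'task://localhost/m_00007',
}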
Example #9
def create_consensus_tasks(basedir, scatter_fn):
    consensus_tasks = []
    consensus_out = {}
    content = json.loads(open(scatter_fn).read())  # array of descriptions
    for section in content:
        parameters = section['parameters']
        aligner = parameters['config']['aligner']
        inputs = section['inputs']
        inputs['scatter_fn'] = scatter_fn
        outputs = section['outputs']
        URL = section['URL']
        p_id = int(parameters['job_id'])
        cns_label = 'cns_%05d' % int(p_id)
        wdir = os.path.join(basedir, 'preads', cns_label)
        if aligner == 'minialign':
            for input_fn, input_fpath in inputs.items():
                inputs[input_fn] = makePypeLocalFile(input_fpath)
        make_c_task = PypeTask(
            inputs=inputs,
            outputs=outputs,
            parameters=parameters,
            wdir=wdir,
        )
        c_task = make_c_task(pype_tasks.task_run_consensus)
        consensus_tasks.append(c_task)
        consensus_out['cjob_%d' % p_id] = outputs['out_file']
    return consensus_tasks, consensus_out
Example #10
def main1(prog_name, input_config_fn, logger_config_fn=None):
    global fc_run_logger
    fc_run_logger = support.setup_logger(logger_config_fn)

    fc_run_logger.info('fc_run started with configuration %s', input_config_fn)
    try:
        config = support.get_dict_from_old_falcon_cfg(
            support.parse_config(input_config_fn))
    except Exception:
        fc_run_logger.exception(
            'Failed to parse config "{}".'.format(input_config_fn))
        raise
    input_fofn_plf = makePypeLocalFile(config['input_fofn'])
    genome_size = config.get('genome_size')
    squash = 0 < genome_size < 1000000
    wf = PypeProcWatcherWorkflow(
        job_type=config['job_type'],
        job_queue=config['job_queue'],
        sge_option=config.get('sge_option', ''),
        watcher_type=config['pwatcher_type'],
        watcher_directory=config['pwatcher_directory'],
        use_tmpdir=config.get('use_tmpdir'),
        squash=squash)
    run(
        wf,
        config,
        os.path.abspath(input_config_fn),
        input_fofn_plf=input_fofn_plf,
    )
Example #11
def create_task_old():
    i1 = './in/i1'
    o1 = './run/dir1/o1.txt'
    i1 = makePypeLocalFile(i1)
    o1 = makePypeLocalFile(o1)
    parameters = {}
    make_task = PypeTask(
            inputs={
                'i1': i1,
            },
            outputs={
                'o1': o1,
            },
            parameters=parameters,
            )
    return make_task(taskA)
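
create_task_old shows the core pattern shared by every example on this page: PypeTask(inputs=..., outputs=..., parameters=...) builds a task factory, and calling that factory on a plain function (here taskA) yields the task object a workflow can schedule. A driving sketch, modeled on the workflow calls in Examples #22/#23 and assuming the same pypeflow imports used throughout these examples; the job_type and max_jobs values are hypothetical:

# Sketch only; the workflow calls mirror Example #23.
wf = PypeProcWatcherWorkflow(job_type='local')   # assumed job_type
wf.max_jobs = 1
task = create_task_old()
wf.addTasks([task])
wf.refreshTargets([task])   # blocks until ./run/dir1/o1.txt has been produced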
Example #12
def main1(prog_name, input_config_fn, logger_config_fn=None):
    global fc_run_logger
    fc_run_logger = support.setup_logger(logger_config_fn)

    fc_run_logger.info('fc_run started with configuration %s', input_config_fn)
    try:
        config = support.get_dict_from_old_falcon_cfg(support.parse_config(input_config_fn))
    except Exception:
        fc_run_logger.exception('Failed to parse config "{}".'.format(input_config_fn))
        raise
    input_fofn_plf = makePypeLocalFile(config['input_fofn'])
    genome_size = config.get('genome_size')
    squash = 0 < genome_size < 1000000
    wf = PypeProcWatcherWorkflow(job_type=config['job_type'],
            job_queue=config['job_queue'],
            sge_option=config.get('sge_option', ''),
            watcher_type=config['pwatcher_type'],
            watcher_directory=config['pwatcher_directory'],
            use_tmpdir=config.get('use_tmpdir'),
            squash=squash
    )
    run(wf, config,
            os.path.abspath(input_config_fn),
            input_fofn_plf=input_fofn_plf,
    )
Example #13
def get_read_ctg_map(rawread_dir, pread_dir, asm_dir):
    read_map_dir = os.path.abspath(os.path.join(asm_dir, 'read_maps'))
    make_dirs(read_map_dir)

    wf = PypeProcWatcherWorkflow(
        max_jobs=12,
    )
    """
            job_type=config['job_type'],
            job_queue=config['job_queue'],
            sge_option=config.get('sge_option', ''),
            watcher_type=config['pwatcher_type'],
            watcher_directory=config['pwatcher_directory'])
    """

    rawread_db = makePypeLocalFile(os.path.join(rawread_dir, 'raw_reads.db'))
    rawread_id_file = makePypeLocalFile(os.path.join(
        read_map_dir, 'dump_rawread_ids', 'rawread_ids'))

    task = PypeTask(
        inputs={'rawread_db': rawread_db},
        outputs={'rawread_id_file': rawread_id_file},
    )
    wf.addTask(task(pype_tasks.task_dump_rawread_ids))

    pread_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db'))
    pread_id_file = makePypeLocalFile(os.path.join(
        read_map_dir, 'dump_pread_ids', 'pread_ids'))

    task = PypeTask(
        inputs={'pread_db': pread_db},
        outputs={'pread_id_file': pread_id_file},
    )
    wf.addTask(task(pype_tasks.task_dump_pread_ids))

    wf.refreshTargets()  # block

    sg_edges_list = makePypeLocalFile(os.path.join(asm_dir, 'sg_edges_list'))
    utg_data = makePypeLocalFile(os.path.join(asm_dir, 'utg_data'))
    ctg_paths = makePypeLocalFile(os.path.join(asm_dir, 'ctg_paths'))

    inputs = {'rawread_id_file': rawread_id_file,
              'pread_id_file': pread_id_file,
              'sg_edges_list': sg_edges_list,
              'utg_data': utg_data,
              'ctg_paths': ctg_paths}

    read_to_contig_map = makePypeLocalFile(os.path.join(
        read_map_dir, 'get_ctg_read_map', 'read_to_contig_map'))

    task = PypeTask(
        inputs=inputs,
        outputs={'read_to_contig_map': read_to_contig_map},
    )
    wf.addTask(task(pype_tasks.task_generate_read_to_ctg_map))

    wf.refreshTargets()  # block
Example #14
def get_read_ctg_map(rawread_dir, pread_dir, asm_dir):
    read_map_dir = os.path.abspath(os.path.join(asm_dir, 'read_maps'))
    make_dirs(read_map_dir)

    wf = PypeProcWatcherWorkflow(max_jobs=12, )
    """
            job_type=config['job_type'],
            job_queue=config['job_queue'],
            sge_option=config.get('sge_option', ''),
            watcher_type=config['pwatcher_type'],
            watcher_directory=config['pwatcher_directory'])
    """

    rawread_db = makePypeLocalFile(os.path.join(rawread_dir, 'raw_reads.db'))
    rawread_id_file = makePypeLocalFile(
        os.path.join(read_map_dir, 'dump_rawread_ids', 'rawread_ids'))

    task = PypeTask(inputs={'rawread_db': rawread_db},
                    outputs={'rawread_id_file': rawread_id_file},
                    TaskType=PypeThreadTaskBase,
                    URL='task://localhost/dump_rawread_ids')
    wf.addTask(task(dump_rawread_ids))

    pread_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db'))
    pread_id_file = makePypeLocalFile(
        os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids'))

    task = PypeTask(inputs={'pread_db': pread_db},
                    outputs={'pread_id_file': pread_id_file},
                    TaskType=PypeThreadTaskBase,
                    URL='task://localhost/dump_pread_ids')
    wf.addTask(task(dump_pread_ids))

    wf.refreshTargets()  # block

    sg_edges_list = makePypeLocalFile(os.path.join(asm_dir, 'sg_edges_list'))
    utg_data = makePypeLocalFile(os.path.join(asm_dir, 'utg_data'))
    ctg_paths = makePypeLocalFile(os.path.join(asm_dir, 'ctg_paths'))

    inputs = {
        'rawread_id_file': rawread_id_file,
        'pread_id_file': pread_id_file,
        'sg_edges_list': sg_edges_list,
        'utg_data': utg_data,
        'ctg_paths': ctg_paths
    }

    read_to_contig_map = makePypeLocalFile(
        os.path.join(read_map_dir, 'get_ctg_read_map', 'read_to_contig_map'))

    task = PypeTask(inputs=inputs,
                    outputs={'read_to_contig_map': read_to_contig_map},
                    TaskType=PypeThreadTaskBase,
                    URL='task://localhost/get_ctg_read_map')
    wf.addTask(task(generate_read_to_ctg_map))

    wf.refreshTargets()  # block
Example #15
def create_quiver_jobs(wf, scattered_quiver_plf):
    scattered_quiver_fn = fn(scattered_quiver_plf)
    jobs = json.loads(open(scattered_quiver_fn).read())
    #ctg_ids = sorted(jobs['ref_seq_data'])
    p_ctg_out=[]
    h_ctg_out=[]
    job_done_plfs = {}
    for job in jobs:
        ctg_id = job['ctg_id']
        ctg_types = job['ctg_types']
        smrt_bin = job['smrt_bin']
        sge_option = job['sge_option']
        ref_fasta = makePypeLocalFile(job['ref_fasta'])
        read_bam = makePypeLocalFile(job['read_bam'])
        m_ctg_id = ctg_id.split('-')[0]
        wd = os.path.join(os.getcwd(), './4-quiver/', m_ctg_id)
        #ref_fasta = makePypeLocalFile(os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id = ctg_id)))
        #read_bam = makePypeLocalFile(os.path.join(os.getcwd(), './4-quiver/reads/' '{ctg_id}.sam'.format(ctg_id = ctg_id)))
        cns_fasta = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fasta.gz'.format(ctg_id = ctg_id)))
        cns_fastq = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fastq.gz'.format(ctg_id = ctg_id)))
        job_done = makePypeLocalFile(os.path.join(wd, '{ctg_id}_quiver_done'.format(ctg_id = ctg_id)))

        if os.path.exists(fn(read_bam)): # TODO(CD): Ask Jason what we should do if missing SAM.
            if ctg_types[ctg_id] == 'p':
                p_ctg_out.append( (fn(cns_fasta), fn(cns_fastq)) )
            elif ctg_types[ctg_id] == 'h':
                h_ctg_out.append( (fn(cns_fasta), fn(cns_fastq)) )
            else:
                LOG.warning('Type is {!r}, not "p" or "h". Why are we running Quiver?'.format(ctg_types[ctg_id]))
            parameters = {
                    'job_uid':'q-'+ctg_id,
                    'ctg_id': ctg_id,
                    'smrt_bin': smrt_bin,
                    'sge_option': sge_option,
            }
            make_quiver_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'read_bam': read_bam,
                                         'scattered_quiver': scattered_quiver_plf,
                                       },
                                       outputs = {'cns_fasta': cns_fasta, 'cns_fastq': cns_fastq, 'job_done': job_done},
                                       parameters = parameters,
            )
            quiver_task = make_quiver_task(task_run_quiver)
            wf.addTask(quiver_task)
            job_done_plfs['{}'.format(ctg_id)] = job_done
    return p_ctg_out, h_ctg_out, job_done_plfs
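
Both quiver variants (this one and Example #2) iterate over a scattered JSON of per-contig jobs. A sketch of one job entry as this version reads it; the concrete paths and values are hypothetical, while the keys are the ones accessed above:

# One job entry of the scattered-quiver JSON; values are made up.
job = {
    'ctg_id': '000000F',
    'ctg_types': {'000000F': 'p'},             # 'p' (primary) or 'h' (haplotig)
    'smrt_bin': '/opt/smrtlink/smrtcmds/bin',  # hypothetical
    'sge_option': '-pe smp 8',                 # hypothetical
    'ref_fasta': './4-quiver/reads/000000F_ref.fa',   # hypothetical path
    'read_bam': './4-quiver/reads/000000F.bam',       # hypothetical path
}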
Example #16
def gen_task(script, inputs, outputs, parameters={}):
    def validate_dict(mydict):
        "Python identifiers are illegal as keys."
        try:
            collections.namedtuple('validate', mydict.keys())
        except ValueError as exc:
            LOG.exception('Bad key name in task definition dict {!r}'.format(mydict))
            raise
    validate_dict(inputs)
    validate_dict(outputs)
    validate_dict(parameters)
    parameters['_bash_'] = script
    make_task = PypeTask(
            inputs={k: makePypeLocalFile(v) for k,v in inputs.iteritems()},
            outputs={k: makePypeLocalFile(v) for k,v in outputs.iteritems()},
            parameters=parameters,
            )
    return make_task(task_generic_bash_script)
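
gen_task packages an arbitrary bash snippet as a task: the script is stashed in parameters['_bash_'], the path dicts become PypeLocalFiles, and the dict keys are validated as namedtuple field names (so they must be usable as Python identifiers). A minimal usage sketch; the script and paths are hypothetical, and how task_generic_bash_script expands the script is not shown here:

# Hypothetical usage of gen_task(); keys must be valid Python identifiers.
task = gen_task(
    script='echo hello > hello.txt',
    inputs={'ifile': './in/some_input.txt'},
    outputs={'ofile': './run/hello/hello.txt'},
    parameters={},
)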
Example #17
def create_consensus_gather_task(wd, inputs):
    # Happens only in stage-0.
    preads_fofn_plf = makePypeLocalFile(os.path.join(wd, 'input_preads.fofn'))

    make_cns_gather_task = PypeTask(
        inputs=inputs,  # consensus_out
        outputs={'preads_fofn': preads_fofn_plf},
    )
    task = make_cns_gather_task(pype_tasks.task_cns_gather)
    return task, preads_fofn_plf
Example #18
def create_tasks_pbalign(chunk_json_pfn, referenceset_pfn, parameters):
    """Create a pbalign task for each chunk, plus a gathering task.
    """
    tasks = list()
    gathering = dict()
    chunk_dir = os.path.dirname(fn(chunk_json_pfn))
    for i, subreadset_fn in enumerate(sorted(yield_pipeline_chunk_names_from_json(open(fn(chunk_json_pfn)), '$chunk.subreadset_id'))):
        wdir = 'run-pbalign-{:02d}'.format(i)
        subreadset_fn = os.path.join(chunk_dir, os.path.basename(subreadset_fn))
        subreadset_pfn = makePypeLocalFile(subreadset_fn)
        unmapped_pfn = makePypeLocalFile('{wdir}/unmapped.txt'.format(**locals()))
        alignmentset_pfn = makePypeLocalFile('{wdir}/align.subreads.{i:02d}.alignmentset.xml'.format(**locals()))
        gathering['unmapped_{:02d}'.format(i)] = unmapped_pfn
        gathering['alignmentsets_{:02d}'.format(i)] = alignmentset_pfn
        """Also produces:
        aligned.subreads.i.alignmentset.bam
        aligned.subreads.i.alignmentset.bam.bai
        aligned.subreads.i.alignmentset.bam.pbi
        """
        make_task = PypeTask(
                inputs = {"chunk_json": chunk_json_pfn,
                          "dataset": subreadset_pfn,
                          "referenceset": referenceset_pfn,
                },
                outputs = {"alignmentset": alignmentset_pfn,
                           "unmapped": unmapped_pfn,
                },
                parameters = parameters,
        )
        task = make_task(start_task.task_pbalign)
        tasks.append(task)
    o_alignmentset_pfn = makePypeLocalFile('run-pbalign_gather/aligned.subreads.alignmentset.xml')
    o_unmapped_pfn = makePypeLocalFile('run-pbalign_gather/unmapped.txt')
    make_task = PypeTask(
            inputs = gathering,
            outputs = {"o_ds": o_alignmentset_pfn,
                       "o_unmapped": o_unmapped_pfn,
            },
            parameters = parameters,
    )
    task = make_task(start_task.task_pbalign_gather)
    tasks.append(task)
    return tasks, alignmentset_pfn
Example #19
def create_consensus_gather_task(wd, inputs):
    # Happens only in stage-0.
    preads_fofn_plf = makePypeLocalFile(os.path.join(wd, 'input_preads.fofn'))

    make_cns_gather_task = PypeTask(
        inputs=inputs,  # consensus_out
        outputs={'preads_fofn': preads_fofn_plf},
    )
    task = make_cns_gather_task(pype_tasks.task_cns_gather)
    return task, preads_fofn_plf
Example #20
def create_consensus_gather_task(wd, inputs):
    # Happens only in stage-0.
    preads_fofn_plf = makePypeLocalFile(os.path.join(wd, 'input_preads.fofn'))

    make_cns_gather_task = PypeTask(
                inputs = inputs, # consensus_out
                outputs =  {'preads_fofn': preads_fofn_plf},
                TaskType = MyFakePypeThreadTaskBase,
                URL = 'task://localhost/cns_gather' )
    task = make_cns_gather_task(pype_tasks.task_cns_gather)
    return task, preads_fofn_plf
Example #21
def create_tasks_fasta2DB(split_subreadsets_fofn_pfn, parameters):
    tasks = list()
    next_inputs = dict()
    topdir = os.path.join(os.path.dirname(fn(split_subreadsets_fofn_pfn)),
                          'run-fastas2fofn')
    # Create the fastas in parallel.
    for i, chunk_fn in enumerate(
            open(fn(split_subreadsets_fofn_pfn)).read().splitlines()):
        wdir = os.path.join(topdir, 'fasta_job_{:03d}'.format(i))  # TODO: 02
        chunk_pfn = makePypeLocalFile(os.path.join(wdir, chunk_fn))
        fasta_done_fn = os.path.join(wdir,
                                     'chunk_{:03d}_done'.format(i))  # TODO: 02
        # By depending on a sentinel, we are allowed to delete fastas later.
        # Note: i might not match num in chunk_fn, but that is ok
        fasta_done_pfn = makePypeLocalFile(fasta_done_fn)
        make_task = PypeTask(
            inputs={
                "dataset": chunk_pfn,
            },
            outputs={
                "fasta_done": fasta_done_pfn,
            },
            parameters=parameters,
        )
        task = make_task(start_task.task_bam2fasta_dexta)
        tasks.append(task)
        next_inputs['fasta_{}_done'.format(i)] = fasta_done_pfn
        #fasta_fn = base_from_done(fasta_done_fn) + '.fasta'  # By convention.
    # Create the FOFN of fastas.
    fasta_fofn_fn = os.path.join(topdir, 'fasta.fofn')
    fasta_fofn_pfn = makePypeLocalFile(fasta_fofn_fn)
    make_task = PypeTask(
        inputs=next_inputs,
        outputs={
            "fofn": fasta_fofn_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_fastas2fofn)
    tasks.append(task)
    return tasks, fasta_fofn_pfn
Example #22
def main():
    lfn = 'logging-cfg.json'
    if os.path.exists(lfn):
        logging.config.dictConfig(json.load(open(lfn)))
    else:
        logging.basicConfig()
        logging.getLogger().setLevel(logging.NOTSET)
        try:
            import logging_tree
            logging_tree.printout()
        except ImportError:
            pass
    log.debug('DEBUG LOGGING ON')
    log.warning('Available via env: JOB_TYPE={}, SLEEP_S={}'.format(
        JOB_TYPE, SLEEP_S))
    exitOnFailure=False
    concurrent_jobs=2
    Workflow = PypeProcWatcherWorkflow
    wf = Workflow(job_type=JOB_TYPE)
    wf.max_jobs = concurrent_jobs

    par = dict(sleep_s=SLEEP_S)
    DIR ='mytmp'
    makedirs(DIR)
    f0 = makePypeLocalFile('mytmp/f0')
    f1 = makePypeLocalFile('mytmp/f1')
    make_task = PypeTask(
            inputs = {},
            outputs = {'f0': f0},
            parameters = par,
    )
    task = make_task(taskrun0)
    wf.addTasks([task])
    make_task = PypeTask(
            inputs = {'f0': f0},
            outputs = {'f1': f1},
            parameters = par,
    )
    task = make_task(taskrun1)
    wf.addTasks([task])
    wf.refreshTargets([task])
Example #23
def main():
    lfn = 'logging-cfg.json'
    if os.path.exists(lfn):
        logging.config.dictConfig(json.load(open(lfn)))
    else:
        logging.basicConfig()
        logging.getLogger().setLevel(logging.NOTSET)
        try:
            import logging_tree
            logging_tree.printout()
        except ImportError:
            pass
    log.debug('DEBUG LOGGING ON')
    log.warning('Available via env: JOB_TYPE={}, SLEEP_S={}'.format(
        JOB_TYPE, SLEEP_S))
    exitOnFailure = False
    concurrent_jobs = 2
    Workflow = PypeProcWatcherWorkflow
    wf = Workflow(job_type=JOB_TYPE)
    wf.max_jobs = concurrent_jobs

    par = dict(sleep_s=SLEEP_S)
    DIR = 'mytmp'
    makedirs(DIR)
    f0 = makePypeLocalFile('mytmp/f0')
    f1 = makePypeLocalFile('mytmp/f1')
    make_task = PypeTask(
        inputs={},
        outputs={'f0': f0},
        parameters=par,
    )
    task = make_task(taskrun0)
    wf.addTasks([task])
    make_task = PypeTask(
        inputs={'f0': f0},
        outputs={'f1': f1},
        parameters=par,
    )
    task = make_task(taskrun1)
    wf.addTasks([task])
    wf.refreshTargets([task])
Example #24
def get_read_hctg_map(asm_dir, hasm_dir, read_to_contig_map_fn):
    wf = PypeProcWatcherWorkflow(
        max_jobs=12,  # TODO: Why was NumThreads ever set? There is only one task!
    )

    rawread_id_file = makePypeLocalFile(
        os.path.join(asm_dir, 'read_maps/dump_rawread_ids/rawread_ids'))
    pread_id_file = makePypeLocalFile(
        os.path.join(asm_dir, 'read_maps/dump_pread_ids/pread_ids'))
    h_ctg_edges = makePypeLocalFile(os.path.join(hasm_dir, 'all_h_ctg_edges'))
    p_ctg_edges = makePypeLocalFile(os.path.join(hasm_dir, 'all_p_ctg_edges'))
    h_ctg_ids = makePypeLocalFile(os.path.join(hasm_dir, "all_h_ctg_ids"))
    #make_dirs(os.path.dirname(os.path.abspath(read_to_contig_map_fn)) # Workflow does this.

    read_to_contig_map_plf = makePypeLocalFile(read_to_contig_map_fn)

    inputs = {
        'rawread_id_file': rawread_id_file,
        'pread_id_file': pread_id_file,
        'h_ctg_edges': h_ctg_edges,
        'p_ctg_edges': p_ctg_edges,
        'h_ctg_ids': h_ctg_ids
    }

    make_task = PypeTask(
        inputs=inputs,
        outputs={'read_to_contig_map': read_to_contig_map_plf},
    )
    wf.addTask(make_task(generate_read_to_hctg_map))
    wf.refreshTargets()  # block
Example #25
def run(
    wf,
    config,
):
    exitOnFailure = True
    #try:
    #    # Make it always re-run.
    #    os.remove('out.txt')
    #except Exception:
    #    LOG.exception('could not remove out.txt')
    o0 = makePypeLocalFile('hey0/out.txt')
    make_task = PypeTask(
        inputs={},
        outputs={'o0': o0},
        parameters={},
    )
    t0 = make_task(mymod.say_hey0)
    o1 = makePypeLocalFile('hey1/out.txt')
    make_task = PypeTask(
        inputs={'i0': o0},
        outputs={'o1': o1},
        parameters={},
    )
    t1 = make_task(mymod.say_hey1)
    wf.addTasks([t0, t1])  # for new-simple-way, we could add just t1
    N = int(os.environ.get('N', '1'))
    for i in range(N):
        make_task = PypeTask(
            inputs={},
            outputs={
                'out': 'touched',
            },
            #outputs = {'out': 'hey-{}/touched'.format(i),},
            parameters={},
            wdir='hey-{}'.format(i),
        )
        t = make_task(mymod.touchit)
        wf.addTask(t)
    wf.refreshTargets(exitOnFailure=exitOnFailure)
Example #26
def main1(prog_name, input_config_fn, logger_config_fn=None):
    global LOG
    LOG = support.setup_logger(logger_config_fn)
    lfs_setstripe_maybe(path='.', stripe=12)

    LOG.info('fc_run started with configuration %s', input_config_fn)
    try:
        config = support.parse_cfg_file(input_config_fn)
        import json
        dumped = json.dumps(config,
                            indent=2,
                            separators=(',', ': '),
                            sort_keys=True)
        LOG.info('cfg=\n{}'.format(dumped))
    except Exception:
        LOG.exception('Failed to parse config "{}".'.format(input_config_fn))
        raise
    general_config = config['General']
    assert 'input_fofn' in general_config, 'Missing "input_fofn" in {}.'.format(
        input_config_fn)
    input_fofn_plf = makePypeLocalFile(general_config['input_fofn'])
    genome_size = int(general_config.get('genome_size', '0'))
    squash = 0 < genome_size < 1000000
    wf = PypeProcWatcherWorkflow(
        job_defaults=config['job.defaults'],
        squash=squash,
    )
    general_config['ver'] = '100'
    config_fn = './config.json'  # must not be in a task-dir
    io.serialize(config_fn, config)
    with open('foo.snake', 'w') as snakemake_writer:
        rule_writer = snakemake.SnakemakeRuleWriter(snakemake_writer)
        run(
            wf,
            config,
            rule_writer,
            os.path.abspath(config_fn),
            input_fofn_plf=input_fofn_plf,
        )
Example #27
def unzip_all(config):
    unzip_blasr_concurrent_jobs = config['unzip_blasr_concurrent_jobs']
    unzip_phasing_concurrent_jobs = config['unzip_phasing_concurrent_jobs']
    wf = PypeProcWatcherWorkflow(
            max_jobs=unzip_blasr_concurrent_jobs,
            job_type=config['job_type'],
            job_queue=config.get('job_queue'),
            sge_option=config.get('sge_option'),
            watcher_type=config.get('pwatcher_type'),
            #watcher_directory=config.get('pwatcher_directory', 'mypwatcher'),
            use_tmpdir=config.get('use_tmpdir'),
    )

    ctg_list_file = makePypeLocalFile('./3-unzip/reads/ctg_list')
    falcon_asm_done = makePypeLocalFile('./2-asm-falcon/falcon_asm_done')
    wdir = os.path.abspath('./3-unzip/reads')
    parameters = {'wd': wdir, 'config': config,
            'sge_option': config['sge_track_reads'],
    }
    job_done = makePypeLocalFile(os.path.join(parameters['wd'], 'track_reads_done'))
    make_track_reads_task = PypeTask(inputs = {'falcon_asm_done': falcon_asm_done},
                                     outputs = {'job_done': job_done, 'ctg_list_file': ctg_list_file},
                                     parameters = parameters,
                                     wdir = wdir,
    )
    track_reads_task = make_track_reads_task(task_track_reads)

    wf.addTask(track_reads_task)
    wf.refreshTargets()  # force refresh now; proper dependencies will be added later

    ctg_ids = []
    with open('./3-unzip/reads/ctg_list') as f:
        for row in f:
            row = row.strip()
            ctg_ids.append(row)

    aln1_outs = {}

    all_ctg_out = {}

    for ctg_id in ctg_ids:
        # inputs
        ref_fasta = makePypeLocalFile('./3-unzip/reads/{ctg_id}_ref.fa'.format(ctg_id = ctg_id))
        read_fasta = makePypeLocalFile('./3-unzip/reads/{ctg_id}_reads.fa'.format(ctg_id = ctg_id))

        # outputs
        wd = os.path.join(os.getcwd(), './3-unzip/0-phasing/{ctg_id}/'.format(ctg_id = ctg_id))
        #mkdir(wd)
        blasr_dir = os.path.join(wd, 'blasr')
        ctg_aln_out = makePypeLocalFile(os.path.join(blasr_dir, '{ctg_id}_sorted.bam'.format(ctg_id = ctg_id)))
        job_done = makePypeLocalFile(os.path.join(blasr_dir, 'aln_{ctg_id}_done'.format(ctg_id = ctg_id)))

        parameters = {'job_uid':'aln-'+ctg_id, 'wd': blasr_dir, 'config':config, 'ctg_id': ctg_id,
                'sge_option': config['sge_blasr_aln'],
        }
        make_blasr_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'read_fasta': read_fasta},
                                   outputs = {'ctg_aln_out': ctg_aln_out, 'job_done': job_done},
                                   parameters = parameters,
        )
        blasr_task = make_blasr_task(task_run_blasr)
        aln1_outs[ctg_id] = (ctg_aln_out, job_done)
        wf.addTask(blasr_task)
    wf.refreshTargets()

    wf.max_jobs = unzip_phasing_concurrent_jobs
    for ctg_id in ctg_ids:
        # inputs
        ref_fasta = makePypeLocalFile('./3-unzip/reads/{ctg_id}_ref.fa'.format(ctg_id = ctg_id))
        read_fasta = makePypeLocalFile('./3-unzip/reads/{ctg_id}_reads.fa'.format(ctg_id = ctg_id))

        # outputs
        wd = os.path.join(os.getcwd(), './3-unzip/0-phasing/{ctg_id}/'.format(ctg_id = ctg_id))

        blasr_dir = os.path.join(wd, 'blasr')
        ctg_aln_out = makePypeLocalFile(os.path.join(blasr_dir, '{ctg_id}_sorted.bam'.format(ctg_id = ctg_id)))

        phasing_dir = os.path.join(wd, 'phasing')
        job_done = makePypeLocalFile(os.path.join(phasing_dir, 'p_{ctg_id}_done'.format(ctg_id = ctg_id)))
        rid_to_phase_out = makePypeLocalFile(os.path.join(wd, 'rid_to_phase.{ctg_id}'.format(ctg_id = ctg_id))) # TODO: ???
        all_ctg_out[ 'r2p.{ctg_id}'.format(ctg_id = ctg_id) ] = rid_to_phase_out # implicit output?

        parameters = {'job_uid':'ha-'+ctg_id, 'wd': wd, 'config':config, 'ctg_id': ctg_id,
                'sge_option': config['sge_phasing'],
        }
        make_phasing_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'aln_bam':ctg_aln_out},
                                   outputs = {'job_done': job_done},
                                   parameters = parameters,
        )
        phasing_task = make_phasing_task(task_phasing)
        wf.addTask(phasing_task)
    wf.refreshTargets()

    hasm_wd = os.path.abspath('./3-unzip/1-hasm/')
    #mkdir(hasm_wd)
    rid_to_phase_all = makePypeLocalFile(os.path.join(hasm_wd, 'rid-to-phase-all', 'rid_to_phase.all'))
    task = PypeTask(inputs = all_ctg_out,
                    outputs = {'rid_to_phase_all': rid_to_phase_all},
    )(get_rid_to_phase_all)
    wf.addTask(task)

    parameters['wd'] = hasm_wd
    parameters['sge_option'] = config['sge_hasm']
    job_done = makePypeLocalFile(os.path.join(hasm_wd, 'hasm_done'))
    make_hasm_task = PypeTask(inputs = {'rid_to_phase_all': rid_to_phase_all},
                              outputs = {'job_done': job_done},
                              parameters = parameters,
    )
    hasm_task = make_hasm_task(task_hasm)

    wf.addTask(hasm_task)

    wf.refreshTargets()
Example #28
def run(wf, config,
        input_config_fn,
        input_fofn_plf,
        ):
    """
    Preconditions (for now):
    * fc_run_logger
    * run_support.logger
    """
    rawread_dir = os.path.abspath('./0-rawreads')
    pread_dir = os.path.abspath('./1-preads_ovl')
    falcon_asm_dir  = os.path.abspath('./2-asm-falcon')
    script_dir = os.path.abspath('./scripts')
    sge_log_dir = os.path.abspath('./sge_log')

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)

    exitOnFailure = config['stop_all_jobs_on_failure'] # only matters for parallel jobs
    concurrent_jobs = config['pa_concurrent_jobs']
    wf.max_jobs = concurrent_jobs

    rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, 'raw-fofn-abs', os.path.basename(config['input_fofn'])))
    make_fofn_abs_task = PypeTask(inputs = {'i_fofn': input_fofn_plf},
                                  outputs = {'o_fofn': rawread_fofn_plf},
                                  parameters = {},
    )
    fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_raw)

    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config['input_type'] == 'raw':
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile( os.path.join( rawread_dir, 'sleep_done') )
        rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, 'rdb_build_done') )
        run_jobs = makePypeLocalFile( os.path.join( rawread_dir, 'run_jobs.sh') )
        parameters = {'work_dir': rawread_dir,
                      'sge_option': config['sge_option_da'],
                      'config_fn': input_config_fn,
                      'config': config}

        length_cutoff_plf = makePypeLocalFile(os.path.join(rawread_dir, 'length_cutoff'))
        raw_reads_db_plf = makePypeLocalFile(os.path.join(rawread_dir, '%s.db' % 'raw_reads'))
        make_build_rdb_task = PypeTask(inputs = {'input_fofn': rawread_fofn_plf},
                                      outputs = {'rdb_build_done': rdb_build_done,
                                                 'raw_reads_db': raw_reads_db_plf,
                                                 'length_cutoff': length_cutoff_plf,
                                                 'run_jobs': run_jobs,
                                      },
                                      parameters = parameters,
        )
        build_rdb_task = make_build_rdb_task(pype_tasks.task_build_rdb)

        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        raw_reads_nblock = support.get_nblock(fn(raw_reads_db_plf))
        #### run daligner
        scattered_plf = os.path.join(rawread_dir, 'daligner-scatter', 'scattered.json')
        make_daligner_scatter = PypeTask(
                inputs = {
                    'run_jobs_fn': run_jobs,
                    'db_build_done': rdb_build_done,
                },
                outputs = {
                    'scatter_fn': scattered_plf,
                },
                parameters = {
                    'db_prefix': 'raw_reads',
                    'nblock': raw_reads_nblock,
                    'pread_aln': False,
                    'config': config,
                },
        )
        task = make_daligner_scatter(pype_tasks.task_daligner_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        daligner_tasks, daligner_out = create_daligner_tasks(rawread_dir, scattered_plf)

        wf.addTasks(daligner_tasks)
        r_gathered_las_plf = makePypeLocalFile(os.path.join(rawread_dir, 'raw-gather', 'gathered_las.txt'))

        parameters =  {
                'nblock': raw_reads_nblock,
        }
        make_daligner_gather = PypeTask(
                   inputs = daligner_out,
                   outputs =  {'gathered': r_gathered_las_plf},
                   parameters = parameters,
        )
        check_r_da_task = make_daligner_gather(pype_tasks.task_daligner_gather)
        wf.addTask(check_r_da_task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        # Merge .las files.
        scattered_plf = os.path.join(rawread_dir, 'merge-scatter', 'scattered.json')
        make_task = PypeTask(
                inputs = {
                    'run_jobs': run_jobs,
                    'gathered_las': r_gathered_las_plf,
                },
                outputs = {
                    'scattered': scattered_plf,
                },
                parameters = {
                    'db_prefix': 'raw_reads',
                    'config': config,
                },
        )
        task = make_task(pype_tasks.task_merge_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        merge_tasks, p_ids_merged_las = create_merge_tasks(rawread_dir, scattered_plf)
        wf.addTasks(merge_tasks)
        task, _, las_fopfn_plf = create_merge_gather_task(os.path.join(rawread_dir, 'merge-gather'), p_ids_merged_las)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        if config['target'] == 'overlapping':
            sys.exit(0)

        # Produce new FOFN of preads fasta, based on consensus of overlaps.
        scattered_plf = os.path.join(rawread_dir, 'cns-scatter', 'scattered.json')
        make_task = PypeTask(
                inputs = {
                    'gathered': las_fopfn_plf,
                    'db': raw_reads_db_plf,
                },
                outputs = {
                    'scattered': scattered_plf,
                },
                parameters = {
                    'db_prefix': 'raw_reads',
                    'config': config,
                },
        )
        task = make_task(pype_tasks.task_consensus_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        tasks, consensus_out = create_consensus_tasks(rawread_dir, scattered_plf)
        wf.addTasks(tasks)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        task, preads_fofn_plf = create_consensus_gather_task(os.path.join(rawread_dir, 'preads'), consensus_out)
        wf.addTask(task)

        rdir = os.path.join(rawread_dir, 'report')
        pre_assembly_report_plf = makePypeLocalFile(os.path.join(rdir, 'pre_assembly_stats.json'))
        parameters = dict(config)
        parameters['cwd'] = rdir
        make_task = PypeTask(
                inputs = {'length_cutoff_fn': length_cutoff_plf,
                          'raw_reads_db': raw_reads_db_plf,
                          'preads_fofn': preads_fofn_plf, },
                outputs = {'pre_assembly_report': pre_assembly_report_plf, },
                parameters = parameters,
        )
        task = make_task(pype_tasks.task_report_pre_assembly)
        wf.addTask(task)

        concurrent_jobs = config['cns_concurrent_jobs']
        wf.max_jobs = concurrent_jobs
        wf.refreshTargets(exitOnFailure=exitOnFailure)


    if config['target'] == 'pre-assembly':
        log.info('Quitting after stage-0 for "pre-assembly" target.')
        sys.exit(0)

    # build pread database
    if config['input_type'] == 'preads':
        preads_fofn_plf = makePypeLocalFile(os.path.join(pread_dir, 'preads-fofn-abs', os.path.basename(config['input_fofn'])))
        make_fofn_abs_task = PypeTask(inputs = {'i_fofn': rawread_fofn_plf},
                                     outputs = {'o_fofn': preads_fofn_plf},
                                     parameters = {},
        )
        fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile( os.path.join( pread_dir, 'pdb_build_done') )
    parameters = {'work_dir': pread_dir,
                  'sge_option': config['sge_option_pda'],
                  'config_fn': input_config_fn,
                  'config': config}

    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    preads_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db')) # Also .preads.*, of course.
    make_build_pdb_task  = PypeTask(inputs = {'preads_fofn': preads_fofn_plf },
                                    outputs = {'pdb_build_done': pdb_build_done,
                                               'preads_db': preads_db,
                                               'run_jobs': run_jobs,
                                    },
                                    parameters = parameters,
    )
    build_pdb_task = make_build_pdb_task(pype_tasks.task_build_pdb)

    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])


    preads_nblock = support.get_nblock(fn(preads_db))
    #### run daligner
    config['sge_option_da'] = config['sge_option_pda']

    scattered_plf = os.path.join(pread_dir, 'daligner-scatter', 'scattered.json')
    make_daligner_scatter = PypeTask(
            inputs = {
                'run_jobs_fn': run_jobs,
                'db_build_done': pdb_build_done,
            },
            outputs = {
                'scatter_fn': scattered_plf,
            },
            parameters = {
                'db_prefix': 'preads',
                'nblock': preads_nblock,
                'pread_aln': True,
                'config': config,
            },
    )
    task = make_daligner_scatter(pype_tasks.task_daligner_scatter)
    wf.addTask(task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    daligner_tasks, daligner_out = create_daligner_tasks(pread_dir, scattered_plf)
    wf.addTasks(daligner_tasks)

    p_gathered_las_plf = makePypeLocalFile(os.path.join(pread_dir, 'gathered-las', 'gathered-las.txt'))
    parameters =  {
            'nblock': preads_nblock,
    }
    make_daligner_gather = PypeTask(
                inputs = daligner_out,
                outputs =  {'gathered': p_gathered_las_plf},
                parameters = parameters,
    )
    check_p_da_task = make_daligner_gather(pype_tasks.task_daligner_gather)
    wf.addTask(check_p_da_task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    # Merge .las files.
    config['sge_option_la'] = config['sge_option_pla']
    scattered_plf = os.path.join(pread_dir, 'merge-scatter', 'scattered.json')
    make_task = PypeTask(
            inputs = {
                'run_jobs': run_jobs,
                'gathered_las': p_gathered_las_plf,
            },
            outputs = {
                'scattered': scattered_plf,
            },
            parameters = {
                'db_prefix': 'preads',
                'config': config,
            },
    )
    task = make_task(pype_tasks.task_merge_scatter)
    wf.addTask(task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    merge_tasks, p_ids_merged_las = create_merge_tasks(pread_dir, scattered_plf)
    wf.addTasks(merge_tasks)
    task, las_fofn_plf, las_fopfn_plf = create_merge_gather_task(os.path.join(pread_dir, 'merge-gather'), p_ids_merged_las)
    wf.addTask(task)

    concurrent_jobs = config['ovlp_concurrent_jobs']
    wf.max_jobs = concurrent_jobs

    wf.refreshTargets(exitOnFailure=exitOnFailure)


    db2falcon_dir = os.path.join(pread_dir, 'db2falcon')
    db2falcon_done = makePypeLocalFile(os.path.join(db2falcon_dir, 'db2falcon_done'))
    preads4falcon_plf = makePypeLocalFile(os.path.join(db2falcon_dir, 'preads4falcon.fasta'))
    make_run_db2falcon = PypeTask(
               inputs = {'las_fofn_plf': las_fofn_plf,
                         'preads_db': preads_db,
                        },
               outputs =  {'db2falcon_done': db2falcon_done,
                           'preads4falcon': preads4falcon_plf,
                          },
               parameters = {'wd': db2falcon_dir,
                             'config': config,
                             'sge_option': config['sge_option_fc'],
                            },
    )
    wf.addTask(make_run_db2falcon(pype_tasks.task_run_db2falcon))

    falcon_asm_done = makePypeLocalFile( os.path.join(falcon_asm_dir, 'falcon_asm_done'))
    make_run_falcon_asm = PypeTask(
               inputs = {'db2falcon_done': db2falcon_done, 'db_file': preads_db,
                         'preads4falcon': preads4falcon_plf,
                         'las_fofn': las_fofn_plf,
                        },
               outputs =  {'falcon_asm_done': falcon_asm_done},
               parameters = {'wd': falcon_asm_dir,
                             'config': config,
                             'pread_dir': pread_dir,
                             'sge_option': config['sge_option_fc'],
               },
    )
    wf.addTask(make_run_falcon_asm(pype_tasks.task_run_falcon_asm))
    wf.refreshTargets()

    return falcon_asm_done
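
run() is driven entirely by the flat config dict produced by get_dict_from_old_falcon_cfg (see Examples #10/#12). A sketch of the keys it reads directly; the values are hypothetical and this is far from a complete FALCON configuration:

# Hypothetical values for the config keys that run() above reads directly.
config = {
    'input_fofn': 'input.fofn',
    'input_type': 'raw',                 # 'preads' skips the stage-0 consensus
    'target': 'assembly',                # 'overlapping' or 'pre-assembly' exit early
    'stop_all_jobs_on_failure': False,
    'pa_concurrent_jobs': 32,
    'cns_concurrent_jobs': 32,
    'ovlp_concurrent_jobs': 32,
    'sge_option_da': '-pe smp 8',        # hypothetical SGE options
    'sge_option_pda': '-pe smp 8',
    'sge_option_pla': '-pe smp 2',
    'sge_option_fc': '-pe smp 24',
}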
Example #29
def flow(config):
    #import pdb; pdb.set_trace()
    parameters = config
    #exitOnFailure=config['stop_all_jobs_on_failure'] # only matter for parallel jobs
    #wf.refreshTargets(exitOnFailure=exitOnFailure)
    #wf = PypeThreadWorkflow()
    #wf = PypeWorkflow()
    #wf = PypeWorkflow(job_type='local')
    log.debug('config=\n{}'.format(pprint.pformat(config)))
    # Set some defaults on the Workflow.
    concurrent_jobs = 24  # TODO: Configure this.
    wf = PypeWorkflow(
        job_type=config['hgap'].get('job_type'),
        job_queue=config['hgap'].get('job_queue'),
        watcher_type=config['hgap'].get('pwatcher_type', 'blocking'),
        #watcher_directory=config['pwatcher_directory'],
        max_jobs=config['hgap'].get('max_jobs', concurrent_jobs),
    )

    use_tmpdir = config['hgap'].get('use_tmpdir')
    if use_tmpdir:
        log.info('hgap.use_tmpdir={!r}'.format(use_tmpdir))
        if use_tmpdir is not True and '/' in use_tmpdir:
            tempfile.tempdir = use_tmpdir
            log.info('Using tempfile.tempdir={}'.format(tempfile.tempdir))
        else:
            log.info('Keeping tempfile.tempdir={}'.format(tempfile.tempdir))

    dataset_pfn = makePypeLocalFile(config['pbsmrtpipe']['input_files'][0])
    filtered_pfn = makePypeLocalFile('run-filterbam/filtered.subreadset.xml')
    make_task = PypeTask(
        inputs={
            "dataset": dataset_pfn,
        },
        outputs={
            "filtered": filtered_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_filterbam)
    wf.addTask(task)

    split_subreadsets_fofn_pfn = makePypeLocalFile(
        'run-bam_scatter/chunked_subreadsets.fofn')
    make_task = PypeTask(
        inputs={
            "dataset": filtered_pfn,
        },
        outputs={
            "split_subreadsets_fofn": split_subreadsets_fofn_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_bam_scatter)
    wf.addTask(task)
    wf.refreshTargets()

    tasks, input_fofn_pfn = create_tasks_fasta2DB(split_subreadsets_fofn_pfn,
                                                  parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()

    fc_cfg_pfn = makePypeLocalFile('run-falcon/fc.cfg')
    fc_json_config_pfn = makePypeLocalFile('run-falcon/fc.json')
    make_task = PypeTask(
        inputs={
            "input_fofn": input_fofn_pfn,
        },
        outputs={
            "fc_cfg": fc_cfg_pfn,
            "fc_json_config": fc_json_config_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_prepare_falcon)
    wf.addTask(task)
    wf.refreshTargets()

    input_config_fn = fn(fc_cfg_pfn)
    with sys.cd('run-falcon'):
        falcon_kit.mains.run1.fc_run_logger = falcon_kit.run_support.logger = logging.getLogger(
            'falcon')
        fc_cfg = falcon_kit.run_support.get_dict_from_old_falcon_cfg(
            falcon_kit.run_support.parse_config(input_config_fn))
        # FALCON takes over the workflow for a while.
        # (For debugging, it is still possible to restart just fc_run, if desired.)
        falcon_asm_done_pfn = falcon_kit.mains.run1.run(
            wf,
            fc_cfg,
            input_config_fn,
            input_fofn_plf=input_fofn_pfn,  # _pfn should be _plf, but oh well
        )
        wf.max_jobs = concurrent_jobs  # in case Falcon changed this

    # Here is a hard-linking task to help us attach falcon into the dependency graph.
    falcon_link_done_pfn = makePypeLocalFile(
        'run-falcon_link/falcon_link_done')
    make_task = PypeTask(
        inputs={
            "falcon_asm_done": falcon_asm_done_pfn,
        },
        outputs={
            "falcon_link_done": falcon_link_done_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_falcon_link)
    wf.addTask(task)

    # The rest of the workflow will operate on datasets, not fasta directly.
    referenceset_pfn = makePypeLocalFile(
        'run-fasta2referenceset/asm.referenceset.xml')
    make_task = PypeTask(
        inputs={
            "falcon_link_done": falcon_link_done_pfn,
        },
        outputs={
            "referenceset": referenceset_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_fasta2referenceset)
    wf.addTask(task)
    wf.refreshTargets()

    # scatter the subreads for pbalign
    """Produces:
    pbalign_chunk.json
    chunk_subreadset_*.subreadset.xml
    """
    pbalign_chunk_json_pfn = makePypeLocalFile(
        'run-pbalign-scatter/pbalign_chunk.json')
    make_task = PypeTask(
        inputs={
            "dataset": dataset_pfn,
            "referenceset": referenceset_pfn,
        },
        outputs={
            "out_json": pbalign_chunk_json_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_pbalign_scatter)
    wf.addTask(task)
    wf.refreshTargets()

    # After scattering, we can specify the pbalign jobs.
    tasks, alignmentset_pfn = create_tasks_pbalign(pbalign_chunk_json_pfn,
                                                   referenceset_pfn,
                                                   parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()

    # scatter the alignmentset for genomic_consensus (variantCaller)
    """Produces:
    gc.chunks.fofn
    ???*.contigset.xml ???
    """
    gc_chunks_fofn_pfn = makePypeLocalFile('run-gc_scatter/gc.chunks.fofn')
    make_task = PypeTask(
        inputs={
            "alignmentset": alignmentset_pfn,
            "referenceset": referenceset_pfn,
        },
        outputs={
            "out_fofn": gc_chunks_fofn_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_gc_scatter)
    wf.addTask(task)
    wf.refreshTargets()

    tasks, contigset_pfn, gathered_fastq_pfn = create_tasks_gc(
        gc_chunks_fofn_pfn, referenceset_pfn, parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()

    # Final report

    polished_assembly_report_json_pfn = makePypeLocalFile(
        'run-polished-assembly-report/polished_assembly_report.json')
    make_task = PypeTask(
        inputs={
            "referenceset": referenceset_pfn,
            "gathered_alignmentset": alignmentset_pfn,
            "polished_fastq": gathered_fastq_pfn,
        },
        outputs={
            "report_json": polished_assembly_report_json_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_polished_assembly_report)
    wf.addTask(task)

    wf.refreshTargets()

    par_dir = os.path.dirname(fn(polished_assembly_report_json_pfn))
    sys.symlink(os.path.join(par_dir, 'polished_coverage_vs_quality.png'))
    sys.symlink(os.path.join(par_dir,
                             'polished_coverage_vs_quality_thumb.png'))
    #return
    ##############

    if not os.path.exists('foo.bar1'):
        sys.system('touch foo.bar1')
    foo_fn1 = makePypeLocalFile('foo.bar1')
    foo_fn2 = makePypeLocalFile('foo.bar2')
    make_task = PypeTask(
        inputs={
            "foo1": foo_fn1,
        },
        outputs={
            "foo2": foo_fn2,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_foo)
    wf.addTask(task)
    wf.refreshTargets()
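
A note on the sys module used above: it is not the standard-library sys but the project's own wrapper exposing cd, symlink, and system helpers. A rough, hypothetical sketch of such a wrapper (names and behavior are assumptions, not the project's actual implementation) might look like this:

# Hypothetical sketch of the small "sys" wrapper used above; an assumption,
# not the project's real module. Only cd(), symlink(), and system() are needed.
import contextlib
import os
import subprocess

@contextlib.contextmanager
def cd(newdir):
    # Temporarily change the working directory, restoring it afterwards.
    prevdir = os.getcwd()
    os.chdir(newdir)
    try:
        yield
    finally:
        os.chdir(prevdir)

def symlink(src, name=None):
    # One-argument form links the target into the current directory.
    if name is None:
        name = os.path.basename(src)
    if os.path.lexists(name):
        os.unlink(name)
    os.symlink(src, name)

def system(call):
    # Like os.system(), but raise on a non-zero exit status.
    rc = subprocess.call(call, shell=True)
    if rc:
        raise Exception('{} <- {!r}'.format(rc, call))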
Example #30
def create_tasks_gc(fofn_pfn, referenceset_pfn, parameters):
    """Create a gc task for each chunk, plus a gathering task.
    Here is the convoluted workflow:
    1. For each gc instance "chunk":
      A. variantCaller writes .fasta
      B. We create a contigset for the .fasta
    2. We keep the contigset output filenames in a FOFN (from run_gc_scatter)
       and pass that to run_gc_gather().
    3. We read each contigset and add them to a gathered ContigSet.
    4. We "consolidate" their underlying .fasta "resources",
       assuming their filenames match except extension.
    5. Finally, we write the gathered contigset.
    Whew!
    We also gather fastq here, for convenience. (A gathering sketch follows this example.)
    """
    tasks = list()
    contigsets = dict()
    fastqs = dict()
    # Assume the paths in the FOFN of gc chunks are all relative to the dir of the FOFN.
    for i, alignmentset_bn in enumerate(open(fn(fofn_pfn)).read().split()):
        alignmentset_fn = os.path.join(os.path.dirname(fn(fofn_pfn)),
                                       alignmentset_bn)
        wdir = 'run-gc-{:02}'.format(i)
        mkdirs(wdir)  # Assume CWD is correct.
        alignmentset_pfn = makePypeLocalFile(
            alignmentset_fn)  # New pfn because it was not a pfn before.
        polished_fastq_pfn = makePypeLocalFile(
            os.path.join(wdir, 'consensus.fastq'))
        variants_gff_pfn = makePypeLocalFile(os.path.join(
            wdir, 'variants.gff'))
        consensus_contigset_pfn = makePypeLocalFile(
            os.path.join(wdir, 'consensus.contigset.xml'))
        """Also produces:
        consensus.fasta
        consensus.fasta.fai

        And note that these file names are important, as pbcoretools gathering expects
        a particular pattern.
        """
        contigsets['contigset_{:02d}'.format(i)] = consensus_contigset_pfn
        fastqs['fastq_{:02d}'.format(i)] = polished_fastq_pfn
        make_task = PypeTask(
            inputs={
                "alignmentset": alignmentset_pfn,
                "referenceset": referenceset_pfn,
            },
            outputs={
                "polished_fastq": polished_fastq_pfn,
                "variants_gff": variants_gff_pfn,
                "consensus_contigset": consensus_contigset_pfn,
            },
            parameters=parameters,
        )
        task = make_task(start_task.task_genomic_consensus)
        tasks.append(task)
    contigset_pfn = makePypeLocalFile('run-gc-gather/contigset.xml')
    gathered_fastq_pfn = makePypeLocalFile('run-gc-gather/gathered.fastq')
    inputs = dict(contigsets)
    inputs.update(fastqs)
    log.debug('inputs to gc_gather:{}'.format(pprint.pformat(inputs)))
    make_task = PypeTask(
        inputs=inputs,
        outputs={
            "ds_out": contigset_pfn,
            "fastq_out": gathered_fastq_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_gc_gather)
    tasks.append(task)
    return tasks, contigset_pfn, gathered_fastq_pfn
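
The gathering sketch promised in the docstring above: a minimal, hypothetical reading of the stated convention (not the actual task_gc_gather), assuming each chunk's consensus.contigset.xml sits next to its consensus.fasta and consensus.fastq.

# Hypothetical gathering sketch (assumption, not the real task_gc_gather):
# it relies only on the filename convention described in the docstring.
import os

def gather_gc_chunks(contigset_fofn_fn, gathered_fastq_fn):
    fofn_dir = os.path.dirname(contigset_fofn_fn)
    contigset_fns = [os.path.join(fofn_dir, line.strip())
                     for line in open(contigset_fofn_fn) if line.strip()]
    # By convention, each chunk wrote consensus.fasta next to its contigset.
    fasta_fns = [cs.replace('.contigset.xml', '.fasta') for cs in contigset_fns]
    # Concatenate the per-chunk fastqs into one gathered fastq.
    with open(gathered_fastq_fn, 'w') as out:
        for cs in contigset_fns:
            fastq_fn = os.path.join(os.path.dirname(cs), 'consensus.fastq')
            out.write(open(fastq_fn).read())
    return fasta_fns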
Example #31
def main(argv=sys.argv):
    global LOG
    LOG = support.setup_logger(None)


    if len(sys.argv) < 2:
        print >>sys.stderr, 'you need to provide a configuration file to specify the cluster running environment'
        sys.exit(1)

    config_fn = sys.argv[1]
    config_absbasedir = os.path.dirname(os.path.abspath(config_fn))

    config = ConfigParser.ConfigParser()
    config.read(config_fn)


    job_type = 'SGE'
    if config.has_option('General', 'job_type'):
        job_type = config.get('General', 'job_type')

    job_queue = 'default'
    if config.has_option('General', 'job_queue'):
        job_queue = config.get('General', 'job_queue')

    pwatcher_type = 'fs_based'
    if config.has_option('General', 'pwatcher_type'):
        pwatcher_type = config.get('General', 'pwatcher_type')

    sge_track_reads = ' -pe smp 12 -q bigmem'
    if config.has_option('Unzip', 'sge_track_reads'):
        sge_track_reads = config.get('Unzip', 'sge_track_reads')

    sge_quiver = ' -pe smp 24 -q bigmem '
    if config.has_option('Unzip', 'sge_quiver'):
        sge_quiver = config.get('Unzip', 'sge_quiver')

    smrt_bin = '/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/'
    if config.has_option('Unzip', 'smrt_bin'):
        smrt_bin = config.get('Unzip', 'smrt_bin')

    input_bam_fofn = 'input_bam.fofn'
    if config.has_option('Unzip', 'input_bam_fofn'):
        input_bam_fofn = config.get('Unzip', 'input_bam_fofn')
    if not os.path.isabs(input_bam_fofn):
        input_bam_fofn = os.path.join(config_absbasedir, input_bam_fofn)


    quiver_concurrent_jobs = 8
    if config.has_option('Unzip', 'quiver_concurrent_jobs'):
        quiver_concurrent_jobs = config.getint('Unzip', 'quiver_concurrent_jobs')

    config = {'job_type': job_type,
              'job_queue': job_queue,
              'sge_quiver': sge_quiver,
              'sge_track_reads': sge_track_reads,
              'input_bam_fofn': input_bam_fofn,
              'pwatcher_type': pwatcher_type,
              'smrt_bin': smrt_bin}
    LOG.info('config={}'.format(pprint.pformat(config)))

    #support.job_type = 'SGE' #tmp hack until we have a configuration parser


    wf = PypeProcWatcherWorkflow(
            max_jobs=quiver_concurrent_jobs,
            job_type=config['job_type'],
            job_queue=config.get('job_queue'),
            sge_option=config.get('sge_option'),
            watcher_type=config.get('pwatcher_type'),
            #watcher_directory=config.get('pwatcher_directory', 'mypwatcher'),
            use_tmpdir=config.get('use_tmpdir'),
    )

    abscwd = os.path.abspath('.')
    parameters = {
            'sge_option': config['sge_track_reads'],
    }
    input_bam_fofn_fn = config['input_bam_fofn']
    input_bam_fofn_plf = makePypeLocalFile(input_bam_fofn_fn)
    hasm_done_plf = makePypeLocalFile('./3-unzip/1-hasm/hasm_done') # by convention
    track_reads_h_done_plf = makePypeLocalFile('./4-quiver/reads/track_reads_h_done')
    make_track_reads_task = PypeTask(inputs = {
                                       'input_bam_fofn': input_bam_fofn_plf,
                                       'hasm_done': hasm_done_plf},
                                     outputs = {'job_done': track_reads_h_done_plf},
                                     parameters = parameters,
    )
    track_reads_task = make_track_reads_task(task_track_reads)

    wf.addTask(track_reads_task)

    scattered_quiver_plf = makePypeLocalFile('4-quiver/quiver_scatter/scattered.json')
    parameters = {
            'config': config,
    }
    make_task = PypeTask(
            inputs = {
                'p_ctg_fa': makePypeLocalFile('3-unzip/all_p_ctg.fa'),
                'h_ctg_fa': makePypeLocalFile('3-unzip/all_h_ctg.fa'),
                'track_reads_h_done': track_reads_h_done_plf,
            },
            outputs = {
                'scattered_quiver_json': scattered_quiver_plf,
            },
            parameters = parameters,
    )
    wf.addTask(make_task(task_scatter_quiver))
    wf.refreshTargets()

    p_ctg_out, h_ctg_out, job_done_plfs = create_quiver_jobs(wf, scattered_quiver_plf)

    gathered_p_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/p_ctg.txt')
    gathered_h_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/h_ctg.txt')
    gather_done_plf = makePypeLocalFile('4-quiver/cns_gather/job_done')
    mkdir('4-quiver/cns_gather')
    with open(fn(gathered_p_ctg_plf), 'w') as out:
        for cns_fasta_fn, cns_fastq_fn in sorted(p_ctg_out):
            out.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))
    with open(fn(gathered_h_ctg_plf), 'w') as out:
        for cns_fasta_fn, cns_fastq_fn in sorted(h_ctg_out):
            out.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))

    make_task = PypeTask(
            inputs = job_done_plfs,
            outputs = {
                'job_done': gather_done_plf,
            },
            parameters = {},
    )
    wf.addTask(make_task(task_gather_quiver))
    wf.refreshTargets()

    cns_p_ctg_fasta_plf = makePypeLocalFile('4-quiver/cns_output/cns_p_ctg.fasta')
    cns_p_ctg_fastq_plf = makePypeLocalFile('4-quiver/cns_output/cns_p_ctg.fastq')
    cns_h_ctg_fasta_plf = makePypeLocalFile('4-quiver/cns_output/cns_h_ctg.fasta')
    cns_h_ctg_fastq_plf = makePypeLocalFile('4-quiver/cns_output/cns_h_ctg.fastq')
    zcat_done_plf = makePypeLocalFile('4-quiver/cns_output/job_done')
    make_task = PypeTask(
            inputs = {
                'gathered_p_ctg': gathered_p_ctg_plf,
                'gathered_h_ctg': gathered_h_ctg_plf,
                'gather_done': gather_done_plf,
            },
            outputs = {
                'cns_p_ctg_fasta': cns_p_ctg_fasta_plf,
                'cns_p_ctg_fastq': cns_p_ctg_fastq_plf,
                'cns_h_ctg_fasta': cns_h_ctg_fasta_plf,
                'cns_h_ctg_fastq': cns_h_ctg_fastq_plf,
                'job_done': zcat_done_plf,
            },
    )
    wf.addTask(make_task(task_cns_zcat))

    wf.refreshTargets()
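
The main() above repeats the same has_option/get-with-default dance for every setting. A small helper, hypothetical and not part of the project, keeps the defaults in one place:

# Hypothetical helper for the repeated has_option/get pattern shown above
# (an assumption, not code from the project).
def get_option(config, section, option, default, getter='get'):
    if config.has_option(section, option):
        return getattr(config, getter)(section, option)
    return default

# Usage, mirroring the settings read above:
#   job_type = get_option(config, 'General', 'job_type', 'SGE')
#   quiver_concurrent_jobs = get_option(
#       config, 'Unzip', 'quiver_concurrent_jobs', 8, getter='getint')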
Example #32
def make_dirs(d):
    if not os.path.isdir(d):
        os.makedirs(d)

rawread_dir = os.path.abspath( "./0-rawreads" )
pread_dir = os.path.abspath( "./1-preads_ovl" )
asm_dir = os.path.abspath( os.path.join("./3-unzip/") )

read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps"))
make_dirs(read_map_dir)

wf = PypeProcWatcherWorkflow(
        max_jobs=12,
)

rawread_db = makePypeLocalFile( os.path.join( rawread_dir, "raw_reads.db" ) )
rawread_id_file = makePypeLocalFile( os.path.join( rawread_dir, "raw_read_ids" ) )

@PypeTask( inputs = {"rawread_db": rawread_db}, 
           outputs =  {"rawread_id_file": rawread_id_file},
           TaskType = PypeThreadTaskBase,
           URL = "task://localhost/dump_rawread_ids" )
def dump_rawread_ids(self):
    rawread_db = fn( self.rawread_db )
    rawread_id_file = fn( self.rawread_id_file )
    os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (rawread_db, rawread_id_file) )

wf.addTask( dump_rawread_ids )

pread_db = makePypeLocalFile( os.path.join( pread_dir, "preads.db" ) )
pread_id_file = makePypeLocalFile( os.path.join( pread_dir, "pread_ids" ) )
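
dump_rawread_ids above shells out with os.system, which discards the exit status. A hedged alternative (same pipeline, hypothetical wrapper, not the project's code) at least fails loudly when the shell reports an error; note that under a plain POSIX shell the pipeline's status is that of the final command, so a bash-specific pipefail would be needed to catch a DBshow failure itself.

# Hypothetical variant of the DBshow call that raises on a non-zero exit
# status instead of ignoring it (the original uses os.system).
import subprocess

def dump_read_ids(db_fn, id_fn):
    cmd = "DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (db_fn, id_fn)
    subprocess.check_call(cmd, shell=True)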
Example #33
def flow(config):
    #import pdb; pdb.set_trace()
    parameters = config
    #exitOnFailure=config['stop_all_jobs_on_failure'] # only matters for parallel jobs
    #wf.refreshTargets(exitOnFailure=exitOnFailure)
    #wf = PypeThreadWorkflow()
    #wf = PypeWorkflow()
    #wf = PypeWorkflow(job_type='local')
    log.debug('config=\n{}'.format(pprint.pformat(config)))
    # Set some defaults on the Workflow.
    concurrent_jobs = 24 # TODO: Configure this.
    wf = PypeWorkflow(
            job_type=config['hgap'].get('job_type'),
            job_queue=config['hgap'].get('job_queue'),
            watcher_type=config['hgap'].get('pwatcher_type', 'blocking'),
            #watcher_directory=config['pwatcher_directory'],
            max_jobs=config['hgap'].get('max_jobs', concurrent_jobs),
    )

    use_tmpdir = config['hgap'].get('use_tmpdir')
    if use_tmpdir:
        log.info('hgap.use_tmpdir={!r}'.format(use_tmpdir))
        if use_tmpdir is not True and '/' in use_tmpdir:
            tempfile.tempdir = use_tmpdir
            log.info('Using tempfile.tempdir={}'.format(tempfile.tempdir))
        else:
            log.info('Keeping tempfile.tempdir={}'.format(tempfile.tempdir))

    dataset_pfn = makePypeLocalFile(config['pbsmrtpipe']['input_files'][0])
    filtered_pfn = makePypeLocalFile('run-filterbam/filtered.subreadset.xml')
    make_task = PypeTask(
            inputs = {"dataset": dataset_pfn, },
            outputs = {"filtered": filtered_pfn, },
            parameters = parameters,
    )
    task = make_task(start_task.task_filterbam)
    wf.addTask(task)

    split_subreadsets_fofn_pfn = makePypeLocalFile('run-bam_scatter/chunked_subreadsets.fofn')
    make_task = PypeTask(
            inputs = {"dataset": filtered_pfn, },
            outputs =  {"split_subreadsets_fofn": split_subreadsets_fofn_pfn, },
            parameters = parameters,
    )
    task = make_task(start_task.task_bam_scatter)
    wf.addTask(task)
    wf.refreshTargets()

    tasks, input_fofn_pfn = create_tasks_fasta2DB(split_subreadsets_fofn_pfn, parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()

    fc_cfg_pfn = makePypeLocalFile('run-falcon/fc.cfg')
    fc_json_config_pfn = makePypeLocalFile('run-falcon/fc.json')
    make_task = PypeTask(
            inputs = {
                      "input_fofn": input_fofn_pfn,
            },
            outputs = {"fc_cfg": fc_cfg_pfn,
                       "fc_json_config": fc_json_config_pfn,
            },
            parameters = parameters,
    )
    task = make_task(start_task.task_prepare_falcon)
    wf.addTask(task)
    wf.refreshTargets()

    input_config_fn = fn(fc_cfg_pfn)
    with sys.cd('run-falcon'):
        falcon_kit.mains.run1.fc_run_logger = falcon_kit.run_support.logger = logging.getLogger('falcon')
        fc_cfg = falcon_kit.run_support.get_dict_from_old_falcon_cfg(
                falcon_kit.run_support.parse_config(input_config_fn))
        # FALCON takes over the workflow for a while.
        # (For debugging, it is still possible to restart just fc_run, if desired.)
        falcon_asm_done_pfn = falcon_kit.mains.run1.run(wf, fc_cfg,
                input_config_fn,
                input_fofn_plf=input_fofn_pfn, # _pfn should be _plf, but oh well
        )
        wf.max_jobs = concurrent_jobs # in case Falcon changed this

    # Here is a hard-linking task to help us attach falcon into the dependency graph.
    falcon_link_done_pfn = makePypeLocalFile('run-falcon_link/falcon_link_done')
    make_task = PypeTask(
            inputs = {"falcon_asm_done": falcon_asm_done_pfn,},
            outputs = {
                       "falcon_link_done": falcon_link_done_pfn,
            },
            parameters = parameters,
    )
    task = make_task(start_task.task_falcon_link)
    wf.addTask(task)

    # The rest of the workflow will operate on datasets, not fasta directly.
    referenceset_pfn = makePypeLocalFile('run-fasta2referenceset/asm.referenceset.xml')
    make_task = PypeTask(
            inputs =  {"falcon_link_done": falcon_link_done_pfn,},
            outputs = {"referenceset": referenceset_pfn,},
            parameters = parameters,
    )
    task = make_task(start_task.task_fasta2referenceset)
    wf.addTask(task)
    wf.refreshTargets()

    # scatter the subreads for pbalign
    """Produces:
    pbalign_chunk.json
    chunk_subreadset_*.subreadset.xml
    """
    pbalign_chunk_json_pfn = makePypeLocalFile('run-pbalign-scatter/pbalign_chunk.json')
    make_task = PypeTask(
            inputs = {"dataset": dataset_pfn,
                      "referenceset": referenceset_pfn,},
            outputs = {"out_json": pbalign_chunk_json_pfn,},
            parameters = parameters,
    )
    task = make_task(start_task.task_pbalign_scatter)
    wf.addTask(task)
    wf.refreshTargets()

    # After scattering, we can specify the pbalign jobs.
    tasks, alignmentset_pfn = create_tasks_pbalign(pbalign_chunk_json_pfn, referenceset_pfn, parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()

    # scatter the alignmentset for genomic_consensus (variantCaller)
    """Produces:
    gc.chunks.fofn
    ???*.contigset.xml ???
    """
    gc_chunks_fofn_pfn = makePypeLocalFile('run-gc_scatter/gc.chunks.fofn')
    make_task = PypeTask(
            inputs = {"alignmentset": alignmentset_pfn,
                      "referenceset": referenceset_pfn,},
            outputs = {"out_fofn": gc_chunks_fofn_pfn,},
            parameters = parameters,
    )
    task = make_task(start_task.task_gc_scatter)
    wf.addTask(task)
    wf.refreshTargets()

    tasks, contigset_pfn, gathered_fastq_pfn = create_tasks_gc(gc_chunks_fofn_pfn, referenceset_pfn, parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()


    # Final report

    polished_assembly_report_json_pfn = makePypeLocalFile('run-polished-assembly-report/polished_assembly_report.json')
    make_task = PypeTask(
            inputs = {"referenceset": referenceset_pfn,
                      "gathered_alignmentset": alignmentset_pfn,
                      "polished_fastq": gathered_fastq_pfn,},
            outputs = {"report_json": polished_assembly_report_json_pfn,},
            parameters = parameters,
    )
    task = make_task(start_task.task_polished_assembly_report)
    wf.addTask(task)

    wf.refreshTargets()

    par_dir = os.path.dirname(fn(polished_assembly_report_json_pfn))
    sys.symlink(os.path.join(par_dir, 'polished_coverage_vs_quality.png'))
    sys.symlink(os.path.join(par_dir, 'polished_coverage_vs_quality_thumb.png'))
    #return
    ##############

    if not os.path.exists('foo.bar1'):
        sys.system('touch foo.bar1')
    foo_fn1 = makePypeLocalFile('foo.bar1')
    foo_fn2 = makePypeLocalFile('foo.bar2')
    make_task = PypeTask(
            inputs = {"foo1": foo_fn1,},
            outputs =  {"foo2": foo_fn2,},
            parameters = parameters,
    )
    task = make_task(start_task.task_foo)
    wf.addTask(task)
    wf.refreshTargets()
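
Several outputs above (falcon_link_done, the foo.bar sentinels) exist only so that later tasks have a file to depend on. A minimal sketch of the sentinel-touching idiom (hypothetical; the real task_falcon_link also hard-links assembly outputs into place):

# Hypothetical sentinel helper (assumption): do the work, then touch the
# declared output file so the dependency graph can observe completion.
import os

def touch_done(done_fn):
    d = os.path.dirname(done_fn)
    if d and not os.path.isdir(d):
        os.makedirs(d)
    open(done_fn, 'w').close()

# Inside a task body this would be called on the unwrapped output, e.g.
#   touch_done(fn(self.falcon_link_done))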
Example #34
def create_tasks_gc(fofn_pfn, referenceset_pfn, parameters):
    """Create a gc task for each chunk, plus a gathering task.
    Here is the convoluted workflow:
    1. For each gc instance "chunk":
      A. variantCaller writes .fasta
      B. We create a contigset for the .fasta
    2. We keep the contigset output filenames in a FOFN (from run_gc_scatter)
       and pass that to run_gc_gather().
    3. We read each contigset and add them to a gathered ContigSet.
    4. We "consolidate" their underlying .fasta "resources",
       assuming their filenames match except extension.
    5. Finally, we write the gathered contigset.
    Whew!
    We also gather fastq here, for convenience.
    """
    tasks = list()
    contigsets = dict()
    fastqs = dict()
    # Assume the paths in the FOFN of gc chunks are all relative to the dir of the FOFN.
    for i, alignmentset_bn in enumerate(open(fn(fofn_pfn)).read().split()):
        alignmentset_fn = os.path.join(os.path.dirname(fn(fofn_pfn)), alignmentset_bn)
        wdir = 'run-gc-{:02}'.format(i)
        mkdirs(wdir) # Assume CWD is correct.
        alignmentset_pfn = makePypeLocalFile(alignmentset_fn) # New pfn because it was not a pfn before.
        polished_fastq_pfn = makePypeLocalFile(os.path.join(wdir, 'consensus.fastq'))
        variants_gff_pfn = makePypeLocalFile(os.path.join(wdir, 'variants.gff'))
        consensus_contigset_pfn = makePypeLocalFile(os.path.join(wdir, 'consensus.contigset.xml'))
        """Also produces:
        consensus.fasta
        consensus.fasta.fai

        And note that these file names are important, as pbcoretools gathering expects
        a particular pattern.
        """
        contigsets['contigset_{:02d}'.format(i)] = consensus_contigset_pfn
        fastqs['fastq_{:02d}'.format(i)] = polished_fastq_pfn
        make_task = PypeTask(
                inputs = {"alignmentset": alignmentset_pfn,
                          "referenceset": referenceset_pfn,},
                outputs = {
                    "polished_fastq": polished_fastq_pfn,
                    "variants_gff": variants_gff_pfn,
                    "consensus_contigset": consensus_contigset_pfn,
                },
                parameters = parameters,
        )
        task = make_task(start_task.task_genomic_consensus)
        tasks.append(task)
    contigset_pfn = makePypeLocalFile('run-gc-gather/contigset.xml')
    gathered_fastq_pfn = makePypeLocalFile('run-gc-gather/gathered.fastq')
    inputs = dict(contigsets)
    inputs.update(fastqs)
    log.debug('inputs to gc_gather:{}'.format(pprint.pformat(inputs)))
    make_task = PypeTask(
            inputs = inputs,
            outputs = {"ds_out": contigset_pfn,
                       "fastq_out": gathered_fastq_pfn,
            },
            parameters = parameters,
    )
    task = make_task(start_task.task_gc_gather)
    tasks.append(task)
    return tasks, contigset_pfn, gathered_fastq_pfn
Example #35
def unzip_all(config):
    unzip_blasr_concurrent_jobs = config['unzip_blasr_concurrent_jobs']
    unzip_phasing_concurrent_jobs = config['unzip_phasing_concurrent_jobs']
    wf = PypeProcWatcherWorkflow(
        max_jobs=unzip_blasr_concurrent_jobs,
        job_type=config['job_type'],
        job_queue=config.get('job_queue'),
        sge_option=config.get('sge_option'),
        watcher_type=config.get('pwatcher_type'),
        #watcher_directory=config.get('pwatcher_directory', 'mypwatcher'),
        use_tmpdir=config.get('use_tmpdir'),
    )

    ctg_list_file = makePypeLocalFile('./3-unzip/reads/ctg_list')
    falcon_asm_done = makePypeLocalFile('./2-asm-falcon/falcon_asm_done')
    wdir = os.path.abspath('./3-unzip/reads')
    parameters = {
        'wd': wdir,
        'config': config,
        'sge_option': config['sge_track_reads'],
    }
    job_done = makePypeLocalFile(
        os.path.join(parameters['wd'], 'track_reads_done'))
    make_track_reads_task = PypeTask(
        inputs={'falcon_asm_done': falcon_asm_done},
        outputs={
            'job_done': job_done,
            'ctg_list_file': ctg_list_file
        },
        parameters=parameters,
        wdir=wdir,
    )
    track_reads_task = make_track_reads_task(task_track_reads)

    wf.addTask(track_reads_task)
    wf.refreshTargets()  # force refresh now; proper dependencies will be added later

    ctg_ids = []
    with open('./3-unzip/reads/ctg_list') as f:
        for row in f:
            row = row.strip()
            ctg_ids.append(row)

    aln1_outs = {}

    all_ctg_out = {}

    for ctg_id in ctg_ids:
        # inputs
        ref_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_ref.fa'.format(ctg_id=ctg_id))
        read_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_reads.fa'.format(ctg_id=ctg_id))

        # outputs
        wd = os.path.join(
            os.getcwd(), './3-unzip/0-phasing/{ctg_id}/'.format(ctg_id=ctg_id))
        #mkdir(wd)
        blasr_dir = os.path.join(wd, 'blasr')
        ctg_aln_out = makePypeLocalFile(
            os.path.join(blasr_dir,
                         '{ctg_id}_sorted.bam'.format(ctg_id=ctg_id)))
        job_done = makePypeLocalFile(
            os.path.join(blasr_dir, 'aln_{ctg_id}_done'.format(ctg_id=ctg_id)))

        parameters = {
            'job_uid': 'aln-' + ctg_id,
            'wd': blasr_dir,
            'config': config,
            'ctg_id': ctg_id,
            'sge_option': config['sge_blasr_aln'],
        }
        make_blasr_task = PypeTask(
            inputs={
                'ref_fasta': ref_fasta,
                'read_fasta': read_fasta
            },
            outputs={
                'ctg_aln_out': ctg_aln_out,
                'job_done': job_done
            },
            parameters=parameters,
        )
        blasr_task = make_blasr_task(task_run_blasr)
        aln1_outs[ctg_id] = (ctg_aln_out, job_done)
        wf.addTask(blasr_task)
    wf.refreshTargets()

    wf.max_jobs = unzip_phasing_concurrent_jobs
    for ctg_id in ctg_ids:
        # inputs
        ref_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_ref.fa'.format(ctg_id=ctg_id))
        read_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_reads.fa'.format(ctg_id=ctg_id))

        # outputs
        wd = os.path.join(
            os.getcwd(), './3-unzip/0-phasing/{ctg_id}/'.format(ctg_id=ctg_id))

        blasr_dir = os.path.join(wd, 'blasr')
        ctg_aln_out = makePypeLocalFile(
            os.path.join(blasr_dir,
                         '{ctg_id}_sorted.bam'.format(ctg_id=ctg_id)))

        phasing_dir = os.path.join(wd, 'phasing')
        job_done = makePypeLocalFile(
            os.path.join(phasing_dir, 'p_{ctg_id}_done'.format(ctg_id=ctg_id)))
        rid_to_phase_out = makePypeLocalFile(
            os.path.join(
                wd,
                'rid_to_phase.{ctg_id}'.format(ctg_id=ctg_id)))  # TODO: ???
        all_ctg_out['r2p.{ctg_id}'.format(
            ctg_id=ctg_id)] = rid_to_phase_out  # implicit output?

        parameters = {
            'job_uid': 'ha-' + ctg_id,
            'wd': wd,
            'config': config,
            'ctg_id': ctg_id,
            'sge_option': config['sge_phasing'],
        }
        make_phasing_task = PypeTask(
            inputs={
                'ref_fasta': ref_fasta,
                'aln_bam': ctg_aln_out
            },
            outputs={'job_done': job_done},
            parameters=parameters,
        )
        phasing_task = make_phasing_task(task_phasing)
        wf.addTask(phasing_task)
    wf.refreshTargets()

    hasm_wd = os.path.abspath('./3-unzip/1-hasm/')
    #mkdir(hasm_wd)
    rid_to_phase_all = makePypeLocalFile(
        os.path.join(hasm_wd, 'rid-to-phase-all', 'rid_to_phase.all'))
    task = PypeTask(
        inputs=all_ctg_out,
        outputs={'rid_to_phase_all': rid_to_phase_all},
    )(get_rid_to_phase_all)
    wf.addTask(task)

    parameters['wd'] = hasm_wd
    parameters['sge_option'] = config['sge_hasm']
    job_done = makePypeLocalFile(os.path.join(hasm_wd, 'hasm_done'))
    make_hasm_task = PypeTask(
        inputs={'rid_to_phase_all': rid_to_phase_all},
        outputs={'job_done': job_done},
        parameters=parameters,
    )
    hasm_task = make_hasm_task(task_hasm)

    wf.addTask(hasm_task)

    wf.refreshTargets()
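
get_rid_to_phase_all is not shown in this example. Judging only from its inputs (one rid_to_phase.{ctg_id} file per contig) and its single rid_to_phase.all output, a plausible sketch (an assumption, not the real function) simply concatenates the per-contig files:

# Hypothetical concatenation sketch for rid_to_phase.all (assumption based on
# the input/output names above).
import os

def concat_rid_to_phase(per_ctg_fns, out_fn):
    out_dir = os.path.dirname(out_fn)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    with open(out_fn, 'w') as out:
        for in_fn in sorted(per_ctg_fns):
            out.write(open(in_fn).read())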
Example #36
def run(
    wf,
    config,
    input_config_fn,
    input_fofn_plf,
):
    """
    Preconditions (for now):
    * fc_run_logger
    * run_support.logger
    """
    rawread_dir = os.path.abspath('./0-rawreads')
    pread_dir = os.path.abspath('./1-preads_ovl')
    falcon_asm_dir = os.path.abspath('./2-asm-falcon')
    script_dir = os.path.abspath('./scripts')
    sge_log_dir = os.path.abspath('./sge_log')

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)

    exitOnFailure = config[
        'stop_all_jobs_on_failure']  # only matters for parallel jobs
    wf.max_jobs = config['default_concurrent_jobs']

    rawread_fofn_plf = makePypeLocalFile(
        os.path.join(rawread_dir, 'raw-fofn-abs',
                     os.path.basename(config['input_fofn'])))
    make_fofn_abs_task = PypeTask(
        inputs={'i_fofn': input_fofn_plf},
        outputs={'o_fofn': rawread_fofn_plf},
        parameters={},
    )
    fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_raw)

    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config['input_type'] == 'raw':
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile(os.path.join(rawread_dir, 'sleep_done'))
        rdb_build_done = makePypeLocalFile(
            os.path.join(rawread_dir, 'rdb_build_done'))
        run_jobs = makePypeLocalFile(os.path.join(rawread_dir, 'run_jobs.sh'))
        parameters = {
            'work_dir': rawread_dir,
            'sge_option': config['sge_option_da'],
            'config_fn': input_config_fn,
            'config': config
        }

        length_cutoff_plf = makePypeLocalFile(
            os.path.join(rawread_dir, 'length_cutoff'))
        raw_reads_db_plf = makePypeLocalFile(
            os.path.join(rawread_dir, '%s.db' % 'raw_reads'))
        make_build_rdb_task = PypeTask(
            inputs={'input_fofn': rawread_fofn_plf},
            outputs={
                'rdb_build_done': rdb_build_done,
                'raw_reads_db': raw_reads_db_plf,
                'length_cutoff': length_cutoff_plf,
                'run_jobs': run_jobs,
            },
            parameters=parameters,
        )
        build_rdb_task = make_build_rdb_task(pype_tasks.task_build_rdb)

        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        raw_reads_nblock = support.get_nblock(fn(raw_reads_db_plf))
        #### run daligner
        wf.max_jobs = config['da_concurrent_jobs']
        scattered_plf = os.path.join(rawread_dir, 'daligner-scatter',
                                     'scattered.json')
        make_daligner_scatter = PypeTask(
            inputs={
                'run_jobs_fn': run_jobs,
                'db_build_done': rdb_build_done,
            },
            outputs={
                'scatter_fn': scattered_plf,
            },
            parameters={
                'db_prefix': 'raw_reads',
                'nblock': raw_reads_nblock,
                'pread_aln': False,
                'config': config,
            },
        )
        task = make_daligner_scatter(pype_tasks.task_daligner_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        daligner_tasks, daligner_out = create_daligner_tasks(
            rawread_dir, scattered_plf)

        wf.addTasks(daligner_tasks)
        r_gathered_las_plf = makePypeLocalFile(
            os.path.join(rawread_dir, 'raw-gather', 'gathered_las.txt'))

        parameters = {
            'nblock': raw_reads_nblock,
        }
        make_daligner_gather = PypeTask(
            inputs=daligner_out,
            outputs={'gathered': r_gathered_las_plf},
            parameters=parameters,
        )
        check_r_da_task = make_daligner_gather(pype_tasks.task_daligner_gather)
        wf.addTask(check_r_da_task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        # Merge .las files.
        wf.max_jobs = config['la_concurrent_jobs']
        scattered_plf = os.path.join(rawread_dir, 'merge-scatter',
                                     'scattered.json')
        make_task = PypeTask(
            inputs={
                'run_jobs': run_jobs,
                'gathered_las': r_gathered_las_plf,
            },
            outputs={
                'scattered': scattered_plf,
            },
            parameters={
                'db_prefix': 'raw_reads',
                'config': config,
            },
        )
        task = make_task(pype_tasks.task_merge_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        merge_tasks, p_ids_merged_las = create_merge_tasks(
            rawread_dir, scattered_plf)
        wf.addTasks(merge_tasks)
        task, _, las_fopfn_plf = create_merge_gather_task(
            os.path.join(rawread_dir, 'merge-gather'), p_ids_merged_las)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        if config['target'] == 'overlapping':
            sys.exit(0)

        # Produce new FOFN of preads fasta, based on consensus of overlaps.
        wf.max_jobs = config['cns_concurrent_jobs']

        scattered_plf = os.path.join(rawread_dir, 'cns-scatter',
                                     'scattered.json')
        make_task = PypeTask(
            inputs={
                'gathered': las_fopfn_plf,
                'db': raw_reads_db_plf,
            },
            outputs={
                'scattered': scattered_plf,
            },
            parameters={
                'db_prefix': 'raw_reads',
                'config': config,
            },
        )
        task = make_task(pype_tasks.task_consensus_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        tasks, consensus_out = create_consensus_tasks(rawread_dir,
                                                      scattered_plf)
        wf.addTasks(tasks)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        task, preads_fofn_plf = create_consensus_gather_task(
            os.path.join(rawread_dir, 'preads'), consensus_out)
        wf.addTask(task)

        rdir = os.path.join(rawread_dir, 'report')
        pre_assembly_report_plf = makePypeLocalFile(
            os.path.join(rdir, 'pre_assembly_stats.json'))
        parameters = dict(config)
        parameters['cwd'] = rdir
        make_task = PypeTask(
            inputs={
                'length_cutoff_fn': length_cutoff_plf,
                'raw_reads_db': raw_reads_db_plf,
                'preads_fofn': preads_fofn_plf,
            },
            outputs={
                'pre_assembly_report': pre_assembly_report_plf,
            },
            parameters=parameters,
        )
        task = make_task(pype_tasks.task_report_pre_assembly)
        wf.addTask(task)

        wf.refreshTargets(exitOnFailure=exitOnFailure)

    if config['target'] == 'pre-assembly':
        log.info('Quitting after stage-0 for "pre-assembly" target.')
        sys.exit(0)

    # build pread database
    if config['input_type'] == 'preads':
        preads_fofn_plf = makePypeLocalFile(
            os.path.join(pread_dir, 'preads-fofn-abs',
                         os.path.basename(config['input_fofn'])))
        make_fofn_abs_task = PypeTask(
            inputs={'i_fofn': rawread_fofn_plf},
            outputs={'o_fofn': preads_fofn_plf},
            parameters={},
        )
        fofn_abs_task = make_fofn_abs_task(
            pype_tasks.task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile(
        os.path.join(pread_dir, 'pdb_build_done'))
    parameters = {
        'work_dir': pread_dir,
        'sge_option': config['sge_option_pda'],
        'config_fn': input_config_fn,
        'config': config
    }

    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    preads_db = makePypeLocalFile(os.path.join(
        pread_dir, 'preads.db'))  # Also .preads.*, of course.
    make_build_pdb_task = PypeTask(
        inputs={'preads_fofn': preads_fofn_plf},
        outputs={
            'pdb_build_done': pdb_build_done,
            'preads_db': preads_db,
            'run_jobs': run_jobs,
        },
        parameters=parameters,
    )
    build_pdb_task = make_build_pdb_task(pype_tasks.task_build_pdb)

    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    preads_nblock = support.get_nblock(fn(preads_db))
    #### run daligner
    wf.max_jobs = config['pda_concurrent_jobs']
    config['sge_option_da'] = config['sge_option_pda']

    scattered_plf = os.path.join(pread_dir, 'daligner-scatter',
                                 'scattered.json')
    make_daligner_scatter = PypeTask(
        inputs={
            'run_jobs_fn': run_jobs,
            'db_build_done': pdb_build_done,
        },
        outputs={
            'scatter_fn': scattered_plf,
        },
        parameters={
            'db_prefix': 'preads',
            'nblock': preads_nblock,
            'pread_aln': True,
            'config': config,
        },
    )
    task = make_daligner_scatter(pype_tasks.task_daligner_scatter)
    wf.addTask(task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    daligner_tasks, daligner_out = create_daligner_tasks(
        pread_dir, scattered_plf)
    wf.addTasks(daligner_tasks)

    p_gathered_las_plf = makePypeLocalFile(
        os.path.join(pread_dir, 'gathered-las', 'gathered-las.txt'))
    parameters = {
        'nblock': preads_nblock,
    }
    make_daligner_gather = PypeTask(
        inputs=daligner_out,
        outputs={'gathered': p_gathered_las_plf},
        parameters=parameters,
    )
    check_p_da_task = make_daligner_gather(pype_tasks.task_daligner_gather)
    wf.addTask(check_p_da_task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    # Merge .las files.
    wf.max_jobs = config['pla_concurrent_jobs']
    config['sge_option_la'] = config['sge_option_pla']
    scattered_plf = os.path.join(pread_dir, 'merge-scatter', 'scattered.json')
    make_task = PypeTask(
        inputs={
            'run_jobs': run_jobs,
            'gathered_las': p_gathered_las_plf,
        },
        outputs={
            'scattered': scattered_plf,
        },
        parameters={
            'db_prefix': 'preads',
            'config': config,
        },
    )
    task = make_task(pype_tasks.task_merge_scatter)
    wf.addTask(task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    merge_tasks, p_ids_merged_las = create_merge_tasks(pread_dir,
                                                       scattered_plf)
    wf.addTasks(merge_tasks)
    task, las_fofn_plf, las_fopfn_plf = create_merge_gather_task(
        os.path.join(pread_dir, 'merge-gather'), p_ids_merged_las)
    wf.addTask(task)

    wf.refreshTargets(exitOnFailure=exitOnFailure)

    # Draft assembly (called 'fc_' for now)
    wf.max_jobs = config['fc_concurrent_jobs']
    db2falcon_dir = os.path.join(pread_dir, 'db2falcon')
    db2falcon_done = makePypeLocalFile(
        os.path.join(db2falcon_dir, 'db2falcon_done'))
    preads4falcon_plf = makePypeLocalFile(
        os.path.join(db2falcon_dir, 'preads4falcon.fasta'))
    make_run_db2falcon = PypeTask(
        inputs={
            'las_fofn_plf': las_fofn_plf,
            'preads_db': preads_db,
        },
        outputs={
            'db2falcon_done': db2falcon_done,
            'preads4falcon': preads4falcon_plf,
        },
        parameters={
            'wd': db2falcon_dir,
            'config': config,
            'sge_option': config['sge_option_fc'],
        },
    )
    wf.addTask(make_run_db2falcon(pype_tasks.task_run_db2falcon))

    falcon_asm_done = makePypeLocalFile(
        os.path.join(falcon_asm_dir, 'falcon_asm_done'))
    make_run_falcon_asm = PypeTask(
        inputs={
            'db2falcon_done': db2falcon_done,
            'db_file': preads_db,
            'preads4falcon': preads4falcon_plf,
            'las_fofn': las_fofn_plf,
        },
        outputs={'falcon_asm_done': falcon_asm_done},
        parameters={
            'wd': falcon_asm_dir,
            'config': config,
            'pread_dir': pread_dir,
            'sge_option': config['sge_option_fc'],
        },
    )
    wf.addTask(make_run_falcon_asm(pype_tasks.task_run_falcon_asm))
    wf.refreshTargets()

    return falcon_asm_done
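
support.get_nblock (used twice above) reads the block count from a DAZZ_DB stub file after DBsplit. A rough sketch, assuming the stub contains a line of the form "blocks = N" (that format is the assumption here; falcon_kit's own implementation may differ):

# Hypothetical sketch of reading the block count from a DAZZ_DB stub file,
# assuming a "blocks = N" line exists; defaults to 1 otherwise.
import re

def get_nblock(db_fn):
    with open(db_fn) as stream:
        for line in stream:
            m = re.match(r'\s*blocks\s*=\s*(\d+)', line)
            if m:
                return int(m.group(1))
    return 1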
Example #37
def phasing(args):
    bam_fn = args.bam
    fasta_fn = args.fasta
    ctg_id = args.ctg_id
    base_dir = args.base_dir
    samtools = args.samtools

    ref_seq = ""
    for r in FastaReader(fasta_fn):
        rid = r.name.split()[0]
        if rid != ctg_id:
            continue
        ref_seq = r.sequence.upper()

    wf = PypeProcWatcherWorkflow(
            max_jobs=1,
    )

    bam_file = makePypeLocalFile(bam_fn)
    vmap_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'het_call', "variant_map") )
    vpos_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'het_call', "variant_pos") )
    q_id_map_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'het_call', "q_id_map") )
    parameters = {}
    parameters["ctg_id"] = ctg_id
    parameters["ref_seq"] = ref_seq
    parameters["base_dir"] = base_dir
    parameters["samtools"] = samtools

    make_het_call_task = PypeTask( inputs = { "bam_file": bam_file },
                         outputs = { "vmap_file": vmap_file, "vpos_file": vpos_file, "q_id_map_file": q_id_map_file },
                         parameters = parameters,
    ) (make_het_call)

    wf.addTasks([make_het_call_task])




    atable_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'g_atable', "atable") )
    parameters = {}
    parameters["ctg_id"] = ctg_id
    parameters["base_dir"] = base_dir
    generate_association_table_task = PypeTask( inputs = { "vmap_file": vmap_file },
                                      outputs = { "atable_file": atable_file },
                                      parameters = parameters,
    ) (generate_association_table)

    wf.addTasks([generate_association_table_task])




    phased_variant_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'get_phased_blocks', "phased_variants") )
    get_phased_blocks_task = PypeTask( inputs = { "vmap_file": vmap_file, "atable_file": atable_file },
                                      outputs = { "phased_variant_file": phased_variant_file },
    ) (get_phased_blocks)
    wf.addTasks([get_phased_blocks_task])




    phased_read_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, "phased_reads") )
    get_phased_reads_task = PypeTask( inputs = { "vmap_file": vmap_file,
                                                 "q_id_map_file": q_id_map_file,
                                                 "phased_variant_file": phased_variant_file },
                                      outputs = { "phased_read_file": phased_read_file },
                                      parameters = {"ctg_id": ctg_id},
    ) (get_phased_reads)
    wf.addTasks([get_phased_reads_task])


    wf.refreshTargets()
Example #38
def main(argv=sys.argv):
    global LOG
    LOG = support.setup_logger(None)

    if len(sys.argv) < 2:
        print >> sys.stderr, 'you need to provide a configuration file to specify the cluster running environment'
        sys.exit(1)

    config_fn = sys.argv[1]
    config_absbasedir = os.path.dirname(os.path.abspath(config_fn))

    config = ConfigParser.ConfigParser()
    config.read(config_fn)

    job_type = 'SGE'
    if config.has_option('General', 'job_type'):
        job_type = config.get('General', 'job_type')

    sge_track_reads = ' -pe smp 12 -q bigmem'
    if config.has_option('Unzip', 'sge_track_reads'):
        sge_track_reads = config.get('Unzip', 'sge_track_reads')

    sge_quiver = ' -pe smp 24 -q bigmem '
    if config.has_option('Unzip', 'sge_quiver'):
        sge_quiver = config.get('Unzip', 'sge_quiver')

    smrt_bin = '/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/'
    if config.has_option('Unzip', 'smrt_bin'):
        smrt_bin = config.get('Unzip', 'smrt_bin')

    input_bam_fofn = 'input_bam.fofn'
    if config.has_option('Unzip', 'input_bam_fofn'):
        input_bam_fofn = config.get('Unzip', 'input_bam_fofn')
    if not os.path.isabs(input_bam_fofn):
        input_bam_fofn = os.path.join(config_absbasedir, input_bam_fofn)

    quiver_concurrent_jobs = 8
    if config.has_option('Unzip', 'quiver_concurrent_jobs'):
        quiver_concurrent_jobs = config.getint('Unzip',
                                               'quiver_concurrent_jobs')

    config = {
        'job_type': job_type,
        'sge_quiver': sge_quiver,
        'sge_track_reads': sge_track_reads,
        'input_bam_fofn': input_bam_fofn,
        'smrt_bin': smrt_bin
    }
    LOG.info('config={}'.format(pprint.pformat(config)))

    #support.job_type = 'SGE' #tmp hack until we have a configuration parser

    wf = PypeProcWatcherWorkflow(max_jobs=quiver_concurrent_jobs, )

    abscwd = os.path.abspath('.')
    parameters = {
        'wd': os.path.join(abscwd, '4-quiver', 'track_reads_h'),
        'config': config
    }
    hasm_done_plf = makePypeLocalFile(
        './3-unzip/1-hasm/hasm_done')  # by convention
    track_reads_h_done_plf = makePypeLocalFile(
        os.path.join(parameters['wd'], 'track_reads_h_done'))
    make_track_reads_task = PypeTask(
        inputs={'hasm_done': hasm_done_plf},
        outputs={'job_done': track_reads_h_done_plf},
        parameters=parameters,
    )
    track_reads_task = make_track_reads_task(task_track_reads)
    #sge_track_reads = config['sge_track_reads']

    wf.addTask(track_reads_task)

    scattered_quiver_plf = makePypeLocalFile(
        '4-quiver/quiver_scatter/scattered.json')
    make_task = PypeTask(
        inputs={
            'p_ctg_fa': makePypeLocalFile('3-unzip/all_p_ctg.fa'),
            'h_ctg_fa': makePypeLocalFile('3-unzip/all_h_ctg.fa'),
            'track_reads_h_done': track_reads_h_done_plf,
        },
        outputs={
            'scattered_quiver_json': scattered_quiver_plf,
        },
        parameters={},
    )
    wf.addTask(make_task(task_scatter_quiver))
    wf.refreshTargets()

    p_ctg_out, h_ctg_out, job_done_plfs = create_quiver_jobs(
        scattered_quiver_plf)

    gathered_p_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/p_ctg.txt')
    gathered_h_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/h_ctg.txt')
    gather_done_plf = makePypeLocalFile('4-quiver/cns_gather/job_done')
    mkdir('4-quiver/cns_gather')
    with open(fn(gathered_p_ctg_plf), 'w') as out:
        for cns_fasta_fn, cns_fastq_fn in sorted(p_ctg_out):
            out.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))
    with open(fn(gathered_h_ctg_plf), 'w') as out:
        for cns_fasta_fn, cns_fastq_fn in sorted(h_ctg_out):
            out.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))

    make_task = PypeTask(
        inputs=job_done_plfs,
        outputs={
            'job_done': gather_done_plf,
        },
        parameters={},
    )
    wf.addTask(make_task(task_gather_quiver))
    wf.refreshTargets()

    cns_p_ctg_fasta_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_p_ctg.fasta')
    cns_p_ctg_fastq_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_p_ctg.fastq')
    cns_h_ctg_fasta_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_h_ctg.fasta')
    cns_h_ctg_fastq_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_h_ctg.fastq')
    zcat_done_plf = makePypeLocalFile('4-quiver/cns_output/job_done')
    make_task = PypeTask(
        inputs={
            'gathered_p_ctg': gathered_p_ctg_plf,
            'gathered_h_ctg': gathered_h_ctg_plf,
            'gather_done': gather_done_plf,
        },
        outputs={
            'cns_p_ctg_fasta': cns_p_ctg_fasta_plf,
            'cns_p_ctg_fastq': cns_p_ctg_fastq_plf,
            'cns_h_ctg_fasta': cns_h_ctg_fasta_plf,
            'cns_h_ctg_fastq': cns_h_ctg_fastq_plf,
            'job_done': zcat_done_plf,
        },
    )
    wf.addTask(make_task(task_cns_zcat))

    wf.refreshTargets()
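
task_cns_zcat is not shown here. Given that each line of the gathered p_ctg.txt / h_ctg.txt written above holds a "cns_fasta cns_fastq" pair of gzipped files, a plausible sketch (an assumption, not the real task) decompresses and concatenates them into the final fasta and fastq:

# Hypothetical zcat-style gathering implied by task_cns_zcat (assumption based
# on the gathered *.txt format written above). Binary-safe for gzip payloads.
import gzip

def zcat_gathered(gathered_txt_fn, out_fasta_fn, out_fastq_fn):
    with open(out_fasta_fn, 'wb') as out_fasta, open(out_fastq_fn, 'wb') as out_fastq:
        for line in open(gathered_txt_fn):
            if not line.strip():
                continue
            cns_fasta_fn, cns_fastq_fn = line.split()
            out_fasta.write(gzip.open(cns_fasta_fn, 'rb').read())
            out_fastq.write(gzip.open(cns_fastq_fn, 'rb').read())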