Example #1
def task_phasing(self):
    ref_fasta = fn(self.ref_fasta)
    aln_bam = fn(self.aln_bam)

    job_done = fn(self.job_done)

    job_uid = self.parameters['job_uid']
    wd = self.parameters['wd']
    ctg_id = self.parameters['ctg_id']

    config = self.parameters['config']
    smrt_bin = config['smrt_bin']
    samtools = os.path.join(smrt_bin, 'samtools')

    script_fn = os.path.join(wd, 'p_%s.sh' % (ctg_id))

    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
cd {wd}
hostname
date
cd {wd}
fc_phasing.py --bam {aln_bam} --fasta {ref_fasta} --ctg_id {ctg_id} --base_dir .. --samtools {samtools}
fc_phasing_readmap.py --ctg_id {ctg_id} --read_map_dir ../../../2-asm-falcon/read_maps --phased_reads phased_reads
date
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
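
All of these task functions follow one contract: resolve input/output paths with fn(), render a bash script, and set self.generated_script_fn for the workflow runner to execute. A minimal sketch of the wiring, modeled on the PypeTask calls in the later examples (the import path and every path/value below are illustrative assumptions):

from pypeflow.simple_pwatcher_bridge import PypeTask, makePypeLocalFile  # assumed import path

parameters = {
    'job_uid': 'p-000000F',                   # illustrative
    'wd': '0-phasing/000000F',                # where the script is written
    'ctg_id': '000000F',
    'config': {'smrt_bin': '/opt/smrt/bin'},  # task_phasing reads only 'smrt_bin'
}
make_task = PypeTask(
    inputs={'ref_fasta': makePypeLocalFile('0-phasing/000000F/ref.fa'),
            'aln_bam': makePypeLocalFile('0-phasing/000000F/aln.bam')},
    outputs={'job_done': makePypeLocalFile('0-phasing/000000F/p_done')},
    parameters=parameters,
)
task = make_task(task_phasing)  # the runner later executes task.generated_script_fn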
Example #2
def task_phasing(self):
    ref_fasta = fn(self.ref_fasta)
    aln_bam = fn(self.aln_bam)

    job_done = fn(self.job_done)

    job_uid = self.parameters['job_uid']
    wd = self.parameters['wd']
    ctg_id = self.parameters['ctg_id']

    config = self.parameters['config']
    smrt_bin = config['smrt_bin']
    samtools = os.path.join(smrt_bin, 'samtools')

    script_fn = os.path.join(wd, 'p_%s.sh' % (ctg_id))

    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
cd {wd}
hostname
date
cd {wd}
fc_phasing.py --bam {aln_bam} --fasta {ref_fasta} --ctg_id {ctg_id} --base_dir .. --samtools {samtools}
fc_phasing_readmap.py --ctg_id {ctg_id} --read_map_dir ../../../2-asm-falcon/read_maps --phased_reads phased_reads
date
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
Example #3
def create_tasks_fasta2DB(split_subreadsets_fofn_pfn, parameters):
    tasks = list()
    next_inputs = dict()
    topdir = os.path.join(os.path.dirname(fn(split_subreadsets_fofn_pfn)), 'run-fastas2fofn')
    # Create the fastas in parallel.
    for i, chunk_fn in enumerate(open(fn(split_subreadsets_fofn_pfn)).read().splitlines()):
        wdir = os.path.join(topdir, 'fasta_job_{:03d}'.format(i)) # TODO: 02
        chunk_pfn = makePypeLocalFile(os.path.join(wdir, chunk_fn))
        fasta_done_fn = os.path.join(wdir, 'chunk_{:03d}_done'.format(i)) # TODO: 02
        # By depending on a sentinel, we are allowed to delete fastas later.
        # Note: i might not match num in chunk_fn, but that is ok
        fasta_done_pfn = makePypeLocalFile(fasta_done_fn)
        make_task = PypeTask(
                inputs = {"dataset": chunk_pfn, },
                outputs =  {"fasta_done": fasta_done_pfn, },
                parameters = parameters,
        )
        task = make_task(start_task.task_bam2fasta_dexta)
        tasks.append(task)
        next_inputs['fasta_{}_done'.format(i)] = fasta_done_pfn
        #fasta_fn = base_from_done(fasta_done_fn) + '.fasta'  # By convention.
    # Create the FOFN of fastas.
    fasta_fofn_fn = os.path.join(topdir, 'fasta.fofn')
    fasta_fofn_pfn = makePypeLocalFile(fasta_fofn_fn)
    make_task = PypeTask(
            inputs = next_inputs,
            outputs =  {"fofn": fasta_fofn_pfn,
            },
            parameters = parameters,
    )
    task = make_task(start_task.task_fastas2fofn)
    tasks.append(task)
    return tasks, fasta_fofn_pfn
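
A call site would fold the returned tasks into the workflow and then depend on the FOFN, following the wf.addTasks()/wf.refreshTargets() pattern of Example #37 (this snippet is illustrative, not from the project):

tasks, fasta_fofn_pfn = create_tasks_fasta2DB(split_subreadsets_fofn_pfn, parameters)
wf.addTasks(tasks)
wf.refreshTargets()
# fn(fasta_fofn_pfn) now names the FOFN listing the per-chunk fasta outputs.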
Example #4
def task_scatter_quiver(self):
    p_ctg_fn = fn(self.p_ctg_fa)
    h_ctg_fn = fn(self.h_ctg_fa)
    out_json = fn(self.scattered_quiver_json)
    track_reads_h_done_fn = fn(self.track_reads_h_done)
    bam_dir = os.path.dirname(track_reads_h_done_fn)
    config = self.parameters['config']

    ref_seq_data = {}

    # I think this will crash if the file is empty. Maybe that is ok.
    p_ctg_fa = FastaReader(p_ctg_fn)
    ctg_types = {}
    for r in p_ctg_fa:
        rid = r.name.split()[0]
        ref_seq_data[rid] = r.sequence
        ctg_types[rid] = 'p'


    # I think this will crash if the file is empty. Maybe that is ok.
    h_ctg_fa = FastaReader(h_ctg_fn)
    for r in h_ctg_fa:
        rid = r.name.split()[0]
        ref_seq_data[rid] = r.sequence
        ctg_types[rid] = 'h'

    ctg_ids = sorted(ref_seq_data.keys())
    #p_ctg_out=[]
    #h_ctg_out=[]
    #job_done_plfs = {}
    jobs = []
    for ctg_id in ctg_ids:
        sequence = ref_seq_data[ctg_id]
        m_ctg_id = ctg_id.split('-')[0]
        wd = os.path.join(os.getcwd(), m_ctg_id)
        ref_fasta = os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id = ctg_id))
        read_bam = os.path.join(bam_dir, '{ctg_id}.bam'.format(ctg_id = ctg_id))
        #cns_fasta = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fasta.gz'.format(ctg_id = ctg_id)))
        #cns_fastq = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fastq.gz'.format(ctg_id = ctg_id)))
        #job_done = makePypeLocalFile(os.path.join(wd, '{ctg_id}_quiver_done'.format(ctg_id = ctg_id)))

        if os.path.exists(read_bam):
            # *.sam are created in task_track_reads, fc_select_reads_from_bam.py
            # Network latency should not matter because we have already waited for the 'done' file.
            mkdir(wd)
            if not os.path.exists(ref_fasta):
                # TODO(CD): Up to 50MB of seq data. Should do this on remote host.
                #   See https://github.com/PacificBiosciences/FALCON_unzip/issues/59
                with open(ref_fasta, 'w') as f:
                    print >>f, '>'+ctg_id
                    print >>f, sequence
            new_job = {}
            new_job['ctg_id'] = ctg_id
            new_job['ctg_types'] = ctg_types
            new_job['smrt_bin'] = config['smrt_bin']
            new_job['sge_option'] = config['sge_quiver']
            new_job['ref_fasta'] = ref_fasta
            new_job['read_bam'] = read_bam
            jobs.append(new_job)
    open(out_json, 'w').write(json.dumps(jobs))
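
The scattered JSON written here is a plain list of per-contig job records; create_quiver_jobs() in Example #33 loads it back with json.loads(). One record looks roughly like this (values illustrative):

[
  {
    "ctg_id": "000000F-001",
    "ctg_types": {"000000F": "p", "000000F-001": "h"},
    "smrt_bin": "/opt/smrt/bin",
    "sge_option": "-pe smp 24",
    "ref_fasta": "/wd/000000F/000000F-001_ref.fa",
    "read_bam": "/wd/4-quiver/reads/000000F-001.bam"
  }
]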
Example #5
def task_track_reads(self):
    input_bam_fofn = fn(self.input_bam_fofn)
    job_done = fn(self.job_done)
    work_dir = os.getcwd()
    basedir = '../..' # assuming we are in ./4-quiver/reads/
    script_fn = 'track_reads_h.sh'

    # For now, in/outputs are in various directories, by convention.
    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
hostname
date
cd {basedir}
fc_get_read_hctg_map.py
fc_rr_hctg_track.py
fc_select_reads_from_bam.py {input_bam_fofn}
date
cd {work_dir}
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
Example #6
def task_hasm(self):
    rid_to_phase_all = fn(self.rid_to_phase_all)
    job_done = fn(self.job_done)
    config = self.parameters['config']
    sge_hasm = config['sge_hasm']

    wd = self.parameters['wd']

    job_type = config['job_type']

    script_dir = os.path.join(wd)
    script_fn = os.path.join(script_dir, 'hasm.sh')

    las_fofn = '../../2-asm-falcon/las.fofn'
    las_fofn = '../../1-preads_ovl/merge-gather/las.fofn'
    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
hostname
date
cd {wd}

fc_ovlp_filter_with_phase.py --fofn {las_fofn} --max_diff 120 --max_cov 120 --min_cov 1 --n_core 12 --min_len 2500 --db ../../1-preads_ovl/preads.db --rid_phase_map {rid_to_phase_all} > preads.p_ovl
fc_phased_ovlp_to_graph.py preads.p_ovl --min_len 2500 > fc.log
if [ -e ../../1-preads_ovl/preads4falcon.fasta ];
then
  ln -sf ../../1-preads_ovl/preads4falcon.fasta .
else
  ln -sf ../../1-preads_ovl/db2falcon/preads4falcon.fasta .
fi
fc_graphs_to_h_tigs.py --fc_asm_path ../../2-asm-falcon/ --fc_hasm_path ./ --ctg_id all --rid_phase_map {rid_to_phase_all} --fasta preads4falcon.fasta

# more script -- a little bit hacky here, we should improve

WD=$PWD
for f in `cat ../reads/ctg_list `; do mkdir -p $WD/$f; cd $WD/$f; fc_dedup_h_tigs.py $f; done

## prepare for quivering the haplotig
cd $WD/..
if [ -e "all_phased_reads" ]; then rm all_phased_reads; fi
if [ -e "all_h_ctg_ids" ]; then rm all_h_ctg_ids; fi
if [ -e "all_p_ctg_edges" ]; then rm all_p_ctg_edges; fi
if [ -e "all_p_ctg.fa" ]; then rm all_p_ctg.fa; fi
if [ -e "all_h_ctg.fa" ]; then rm all_h_ctg.fa; fi

find 0-phasing -name "phased_reads" | sort | xargs cat >> all_phased_reads
find 1-hasm -name "h_ctg_ids.*" | sort | xargs cat >> all_h_ctg_ids
find 1-hasm -name "p_ctg_edges.*" | sort | xargs cat >> all_p_ctg_edges
find 1-hasm -name "h_ctg_edges.*" | sort | xargs cat >> all_h_ctg_edges
find 1-hasm -name "p_ctg.*.fa" | sort | xargs cat >> all_p_ctg.fa
find 1-hasm -name "h_ctg.*.fa" | sort | xargs cat >> all_h_ctg.fa
cd ../
date
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
Example #7
def get_phased_reads(self):

    q_id_map_fn = fn(self.q_id_map_file)
    vmap_fn = fn(self.vmap_file)
    p_variant_fn = fn(self.phased_variant_file)
    parameters = self.parameters

    ctg_id = parameters["ctg_id"]

    phased_read_fn = fn(self.phased_read_file)

    rid_map = {}
    with open(q_id_map_fn) as f:
        for l in f:
            l = l.strip().split()
            rid_map[int(l[0])] = l[1]


    read_to_variants = {}
    variant_to_reads = {}
    with open(vmap_fn) as f:
        for l in f:
            l = l.strip().split()
            variant = "_".join(l[:3])
            read_id = int(l[3])
            read_to_variants.setdefault(read_id, set())
            read_to_variants[read_id].add(variant)
            variant_to_reads.setdefault(variant, set())
            variant_to_reads[variant].add(read_id)


    variant_to_phase = {}
    with open(p_variant_fn) as f:
        for l in f:
            """line format example: V 1 6854 6854_A_A 6854_A_G 6854 22781"""
            l = l.strip().split()
            if l[0] != "V":
                continue
            pb_id = int(l[1])
            variant_to_phase[ l[3] ] = (pb_id, 0)
            variant_to_phase[ l[4] ] = (pb_id, 1)

    with open(phased_read_fn, "w") as out_f:
        for r in read_to_variants:
            vl = {}
            pl = set()
            for v in list( read_to_variants[r] ):
                if v in variant_to_phase:
                    p = variant_to_phase[v]
                    vl[ p ] = vl.get(p, 0) + 1
                    pl.add(p[0])
            pl = list(pl)
            pl.sort()
            for p in pl:
                if vl.get( (p,0), 0) - vl.get( (p,1), 0) > 1:
                    print >> out_f, r, ctg_id, p, 0, vl.get( (p,0), 0), vl.get( (p,1), 0), rid_map[r]
                elif vl.get( (p,1), 0) - vl.get( (p,0), 0) > 1:
                    print >> out_f, r, ctg_id, p, 1, vl.get( (p,0), 0), vl.get( (p,1), 0), rid_map[r]
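
The final loop is a simple vote: a read is assigned to haplotype 0 of phasing block p only when it carries at least two more phase-0 variants than phase-1 variants (and symmetrically for haplotype 1); a margin of one or a tie leaves the read unassigned for that block. A worked example with hypothetical counts:

vl = {(6854, 0): 3, (6854, 1): 1}  # read covers 3 phase-0 and 1 phase-1 variants in block 6854
assert vl.get((6854, 0), 0) - vl.get((6854, 1), 0) > 1  # 3 - 1 = 2 > 1: assigned phase 0
# With counts 2 vs 1 the margin is only 1, so neither branch fires and nothing is printed.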
Example #8
def task_hasm(self):
    rid_to_phase_all = fn(self.rid_to_phase_all)
    job_done = fn(self.job_done)
    #config = self.parameters['config']

    wd = self.parameters['wd']
    script_fn = os.path.join(wd, 'hasm.sh')

    las_fofn = '../../2-asm-falcon/las.fofn'
    las_fofn = '../../1-preads_ovl/merge-gather/las.fofn'
    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
hostname
date
cd {wd}

fc_ovlp_filter_with_phase.py --fofn {las_fofn} --max_diff 120 --max_cov 120 --min_cov 1 --n_core 48 --min_len 2500 --db ../../1-preads_ovl/preads.db --rid_phase_map {rid_to_phase_all} > preads.p_ovl
fc_phased_ovlp_to_graph.py preads.p_ovl --min_len 2500 > fc.log
if [ -e ../../1-preads_ovl/preads4falcon.fasta ];
then
  ln -sf ../../1-preads_ovl/preads4falcon.fasta .
else
  ln -sf ../../1-preads_ovl/db2falcon/preads4falcon.fasta .
fi
fc_graphs_to_h_tigs.py --fc_asm_path ../../2-asm-falcon/ --fc_hasm_path ./ --ctg_id all --rid_phase_map {rid_to_phase_all} --fasta preads4falcon.fasta

# more script -- a little bit hacky here, we should improve

WD=$PWD
for f in `cat ../reads/ctg_list `; do mkdir -p $WD/$f; cd $WD/$f; fc_dedup_h_tigs.py $f; done

## prepare for quivering the haplotig
cd $WD/..
if [ -e "all_phased_reads" ]; then rm all_phased_reads; fi
if [ -e "all_h_ctg_ids" ]; then rm all_h_ctg_ids; fi
if [ -e "all_p_ctg_edges" ]; then rm all_p_ctg_edges; fi
if [ -e "all_p_ctg.fa" ]; then rm all_p_ctg.fa; fi
if [ -e "all_h_ctg.fa" ]; then rm all_h_ctg.fa; fi

find 0-phasing -name "phased_reads" | sort | xargs cat >> all_phased_reads
find 1-hasm -name "h_ctg_ids.*" | sort | xargs cat >> all_h_ctg_ids
find 1-hasm -name "p_ctg_edges.*" | sort | xargs cat >> all_p_ctg_edges
find 1-hasm -name "h_ctg_edges.*" | sort | xargs cat >> all_h_ctg_edges
find 1-hasm -name "p_ctg.*.fa" | sort | xargs cat >> all_p_ctg.fa
find 1-hasm -name "h_ctg.*.fa" | sort | xargs cat >> all_h_ctg.fa
cd ../
date
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
Example #9
def taskA(self):
    i1 = fn(self.i1)
    o1 = fn(self.o1)
    script = """
set -vex
cat {i1} > {o1}
""".format(**locals())
    script_fn = 'script.sh'
    with open(script_fn, 'w') as ofs:
        ofs.write(script)
    self.generated_script_fn = script_fn
Example #10
def create_tasks_pbalign(chunk_json_pfn, referenceset_pfn, parameters):
    """Create a pbalign task for each chunk, plus a gathering task.
    """
    tasks = list()
    gathering = dict()
    chunk_dir = os.path.dirname(fn(chunk_json_pfn))
    for i, subreadset_fn in enumerate(
            sorted(
                yield_pipeline_chunk_names_from_json(open(fn(chunk_json_pfn)),
                                                     '$chunk.subreadset_id'))):
        wdir = 'run-pbalign-{:02d}'.format(i)
        subreadset_fn = os.path.join(chunk_dir,
                                     os.path.basename(subreadset_fn))
        subreadset_pfn = makePypeLocalFile(subreadset_fn)
        unmapped_pfn = makePypeLocalFile(
            '{wdir}/unmapped.txt'.format(**locals()))
        alignmentset_pfn = makePypeLocalFile(
            '{wdir}/align.subreads.{i:02d}.alignmentset.xml'.format(
                **locals()))
        gathering['unmapped_{:02d}'.format(i)] = unmapped_pfn
        gathering['alignmentsets_{:02d}'.format(i)] = alignmentset_pfn
        """Also produces:
        aligned.subreads.i.alignmentset.bam
        aligned.subreads.i.alignmentset.bam.bai
        aligned.subreads.i.alignmentset.bam.pbi
        """
        make_task = PypeTask(
            inputs={
                "chunk_json": chunk_json_pfn,
                "dataset": subreadset_pfn,
                "referenceset": referenceset_pfn,
            },
            outputs={
                "alignmentset": alignmentset_pfn,
                "unmapped": unmapped_pfn,
            },
            parameters=parameters,
        )
        task = make_task(start_task.task_pbalign)
        tasks.append(task)
    o_alignmentset_pfn = makePypeLocalFile(
        'run-pbalign_gather/aligned.subreads.alignmentset.xml')
    o_unmapped_pfn = makePypeLocalFile('run-pbalign_gather/unmapped.txt')
    make_task = PypeTask(
        inputs=gathering,
        outputs={
            "o_ds": o_alignmentset_pfn,
            "o_unmapped": o_unmapped_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_pbalign_gather)
    tasks.append(task)
    return tasks, alignmentset_pfn
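
yield_pipeline_chunk_names_from_json() is not shown in these examples. Judging from its call sites, it walks a pbsmrtpipe-style chunk JSON and yields the value stored under the given chunk key. A plausible stand-in (the JSON layout here is a guess, not the actual pbcommand schema):

import json

def yield_pipeline_chunk_names_from_json(ifs, chunk_key):
    # Assumed layout: {"chunks": [{"chunk_keys": {"$chunk.subreadset_id": "..."}}, ...]}
    data = json.load(ifs)
    for chunk in data.get('chunks', []):
        yield chunk['chunk_keys'][chunk_key]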
Example #11
def get_rid_to_phase_all(self):
    # Tasks must be at module scope now.
    rid_to_phase_all_fn = fn(self.rid_to_phase_all)
    inputs_fn = [fn(f) for f in self.inputs.values()]
    inputs_fn.sort()
    output = []
    for fname in inputs_fn:
        output.append(open(fname).read())

    out = open(rid_to_phase_all_fn, 'w')
    out.write(''.join(output))
    out.close()
Example #12
def get_rid_to_phase_all(self):
    # Tasks must be at module scope now.
    rid_to_phase_all_fn = fn(self.rid_to_phase_all)
    inputs_fn = [fn(f) for f in self.inputs.values()]
    inputs_fn.sort()
    output = []
    for fname in inputs_fn:
        output.append(open(fname).read())

    out = open(rid_to_phase_all_fn, 'w')
    out.write(''.join(output))
    out.close()
Example #13
def say_hey1(self):
    o1 = fn(self.o1)
    i0 = fn(self.i0)
    script = """\
#!/bin/bash

echo hey1
touch %(o1)s
""" % locals()
    script_fn = 'run-hey.sh'
    with open(script_fn, 'w') as ofs:
        ofs.write(script)
    self.generated_script_fn = script_fn
Example #14
def task_run_quiver(self):

    ref_fasta = fn(self.ref_fasta)
    read_sam = fn(self.read_sam)

    cns_fasta = fn(self.cns_fasta)
    cns_fastq = fn(self.cns_fastq)
    job_done = fn(self.job_done)

    job_uid = self.parameters['job_uid']
    wd = self.parameters['wd']
    config = self.parameters['config']
    ctg_id = self.parameters['ctg_id']

    smrt_bin = config['smrt_bin']
    sge_quiver = config['sge_quiver']
    job_type = config['job_type']
    samtools = os.path.join(smrt_bin, 'samtools')
    pbalign = os.path.join(smrt_bin, 'pbalign')
    makePbi = os.path.join(smrt_bin, 'makePbi')
    variantCaller = os.path.join(smrt_bin, 'variantCaller')

    script_dir = os.path.join(wd)
    script_fn = os.path.join(script_dir, 'cns_%s.sh' % (ctg_id))

    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
hostname
date
cd {wd}

{samtools} faidx {ref_fasta}
{samtools} view -b -S {read_sam} > {ctg_id}.bam
{pbalign} --tmpDir=/localdisk/scratch/ --nproc=24 --minAccuracy=0.75 --minLength=50\
          --minAnchorSize=12 --maxDivergence=30 --concordant --algorithm=blasr\
          --algorithmOptions=-useQuality --maxHits=1 --hitPolicy=random --seed=1\
            {ctg_id}.bam {ref_fasta} aln-{ctg_id}.bam
#{makePbi} --referenceFasta {ref_fasta} aln-{ctg_id}.bam
({variantCaller} -x 5 -X 120 -q 20 -j 24 -r {ref_fasta} aln-{ctg_id}.bam\
            -o {cns_fasta} -o {cns_fastq}) || echo quiver failed
date
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
Example #15
def task_track_reads(self):
    job_done = fn(self.job_done)
    wd = self.parameters['wd']
    #config = self.parameters['config']
    script_fn = os.path.join(wd, 'track_reads.sh')
    topdir = '../..'

    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
hostname
date
cd {topdir}
python -m falcon_kit.mains.get_read_ctg_map
python -m falcon_kit.mains.rr_ctg_track
python -m falcon_kit.mains.pr_ctg_track
#mkdir -p 3-unzip/reads/
python -m falcon_kit.mains.fetch_reads
cd {wd}
date
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
Example #16
def task_track_reads(self):
    job_done = fn(self.job_done)
    wd = self.parameters['wd']
    config = self.parameters['config']
    input_bam_fofn = config['input_bam_fofn']
    sge_track_reads = config['sge_track_reads']
    script_dir = os.path.join(wd)
    script_fn = os.path.join(script_dir, 'track_reads_h.sh')

    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
cd {wd}
hostname
date
fc_get_read_hctg_map.py --basedir ../..
fc_rr_hctg_track.py --base_dir ../..
mkdir -p 4-quiver/reads/
fc_select_reads_from_bam.py --basedir ../.. {input_bam_fofn}
date
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
Example #17
def touchit(self):
    out = fn(self.out)
    s = 1
    LOG.info('sleep {}'.format(s))
    time.sleep(s)
    cmd = 'touch {}'.format(out)
    system(cmd)
Example #18
def task_track_reads(self):
    job_done = fn(self.job_done)
    wd = self.parameters['wd']
    #config = self.parameters['config']
    script_fn = os.path.join(wd, 'track_reads.sh')
    topdir = '../..'

    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
hostname
date
cd {topdir}
python -m falcon_kit.mains.get_read_ctg_map
python -m falcon_kit.mains.rr_ctg_track
python -m falcon_kit.mains.pr_ctg_track
#mkdir -p 3-unzip/reads/
python -m falcon_kit.mains.fetch_reads
cd {wd}
date
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
Example #19
def create_tasks_fasta2DB(split_subreadsets_fofn_pfn, parameters):
    tasks = list()
    next_inputs = dict()
    topdir = os.path.join(os.path.dirname(fn(split_subreadsets_fofn_pfn)),
                          'run-fastas2fofn')
    # Create the fastas in parallel.
    for i, chunk_fn in enumerate(
            open(fn(split_subreadsets_fofn_pfn)).read().splitlines()):
        wdir = os.path.join(topdir, 'fasta_job_{:03d}'.format(i))  # TODO: 02
        chunk_pfn = makePypeLocalFile(os.path.join(wdir, chunk_fn))
        fasta_done_fn = os.path.join(wdir,
                                     'chunk_{:03d}_done'.format(i))  # TODO: 02
        # By depending on a sentinel, we are allowed to delete fastas later.
        # Note: i might not match num in chunk_fn, but that is ok
        fasta_done_pfn = makePypeLocalFile(fasta_done_fn)
        make_task = PypeTask(
            inputs={
                "dataset": chunk_pfn,
            },
            outputs={
                "fasta_done": fasta_done_pfn,
            },
            parameters=parameters,
        )
        task = make_task(start_task.task_bam2fasta_dexta)
        tasks.append(task)
        next_inputs['fasta_{}_done'.format(i)] = fasta_done_pfn
        #fasta_fn = base_from_done(fasta_done_fn) + '.fasta'  # By convention.
    # Create the FOFN of fastas.
    fasta_fofn_fn = os.path.join(topdir, 'fasta.fofn')
    fasta_fofn_pfn = makePypeLocalFile(fasta_fofn_fn)
    make_task = PypeTask(
        inputs=next_inputs,
        outputs={
            "fofn": fasta_fofn_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_fastas2fofn)
    tasks.append(task)
    return tasks, fasta_fofn_pfn
Example #20
def task_run_blasr(self):
    job_done = fn(self.job_done)
    ref_fasta = fn(self.ref_fasta)
    read_fasta = fn(self.read_fasta)

    job_uid = self.parameters['job_uid']
    wd = self.parameters['wd']
    ctg_id = self.parameters['ctg_id']

    config = self.parameters['config']
    smrt_bin = config['smrt_bin']
    sge_blasr_aln = config['sge_blasr_aln']
    job_type = config['job_type']
    blasr = os.path.join(smrt_bin, 'blasr')
    samtools = os.path.join(smrt_bin, 'samtools')

    script_dir = os.path.join(wd)
    script_fn = os.path.join(script_dir,
                             'aln_{ctg_id}.sh'.format(ctg_id=ctg_id))

    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
cd {wd}
hostname
date
cd {wd}
time {blasr} {read_fasta} {ref_fasta} -noSplitSubreads -clipping subread\
 -hitPolicy randombest -randomSeed 42 -bestn 1 -minPctIdentity 70.0\
 -minMatch 12  -nproc 24 -sam -out tmp_aln.sam
{samtools} view -bS tmp_aln.sam | {samtools} sort - {ctg_id}_sorted
{samtools} index {ctg_id}_sorted.bam
rm tmp_aln.sam
date
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
Example #21
def generate_read_to_ctg_map(self):
    rawread_id_file = fn(self.rawread_id_file)
    pread_id_file = fn(self.pread_id_file)
    read_to_contig_map = fn(self.read_to_contig_map)

    pread_did_to_rid = open(pread_id_file).read().split('\n')
    rid_to_oid = open(rawread_id_file).read().split('\n')

    asm_G = AsmGraph(fn(self.sg_edges_list), fn(self.utg_data),
                     fn(self.ctg_paths))

    pread_to_contigs = {}

    with open(read_to_contig_map, 'w') as f:
        for ctg in asm_G.ctg_data:
            if ctg[-1] == 'R':
                continue
            ctg_g = asm_G.get_sg_for_ctg(ctg)
            for n in ctg_g.nodes():
                pid = int(n.split(':')[0])

                rid = pread_did_to_rid[pid].split('/')[1]
                rid = int(int(rid) / 10)
                oid = rid_to_oid[rid]
                k = (pid, rid, oid)
                pread_to_contigs.setdefault(k, set())
                pread_to_contigs[k].add(ctg)

        for k in pread_to_contigs:
            pid, rid, oid = k
            for ctg in list(pread_to_contigs[k]):
                print >> f, '%09d %09d %s %s' % (pid, rid, oid, ctg)
Example #22
def task_run_quiver(self):
    ref_fasta = fn(self.ref_fasta)
    read_bam = fn(self.read_bam)

    cns_fasta = fn(self.cns_fasta)
    cns_fastq = fn(self.cns_fastq)
    job_done = fn(self.job_done)

    job_uid = self.parameters['job_uid']
    ctg_id = self.parameters['ctg_id']

    smrt_bin = self.parameters['smrt_bin']
    samtools = os.path.join(smrt_bin, 'samtools')
    pbalign = os.path.join(smrt_bin, 'pbalign')
    makePbi = os.path.join(smrt_bin, 'makePbi')
    variantCaller = os.path.join(smrt_bin, 'variantCaller')

    script_fn = 'cns_%s.sh' % (ctg_id)

    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
hostname
date

{samtools} faidx {ref_fasta}
{pbalign} --tmpDir=/localdisk/scratch/ --nproc=24 --minAccuracy=0.75 --minLength=50\
          --minAnchorSize=12 --maxDivergence=30 --concordant --algorithm=blasr\
          --algorithmOptions=--useQuality --maxHits=1 --hitPolicy=random --seed=1\
            {read_bam} {ref_fasta} aln-{ctg_id}.bam
({variantCaller} --algorithm=arrow -x 5 -X 120 -q 20 -j 24 -r {ref_fasta} aln-{ctg_id}.bam\
            -o {cns_fasta} -o {cns_fastq}) || echo quiver failed
date
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
Example #23
def say_hey0(self):
    o0 = fn(self.o0)
    print 'hey', o0
    script = """\
#!/bin/bash

echo hey0
touch %(o0)s
""" % locals()
    script_fn = 'run-hey.sh'
    with open(script_fn, 'w') as ofs:
        ofs.write(script)
    self.generated_script_fn = script_fn
Example #24
def task_run_blasr(self):
    job_done = fn(self.job_done)
    ref_fasta = fn(self.ref_fasta)
    read_fasta = fn(self.read_fasta)

    job_uid = self.parameters['job_uid']
    wd = self.parameters['wd']
    ctg_id = self.parameters['ctg_id']

    config = self.parameters['config']
    smrt_bin = config['smrt_bin']
    blasr = os.path.join(smrt_bin, 'blasr')
    samtools = os.path.join(smrt_bin, 'samtools')


    script_dir = os.path.join(wd)
    script_fn = os.path.join(script_dir, 'aln_{ctg_id}.sh'.format(ctg_id=ctg_id))

    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
cd {wd}
hostname
date
cd {wd}
time {blasr} {read_fasta} {ref_fasta} --noSplitSubreads --clipping subread\
 --hitPolicy randombest --randomSeed 42 --bestn 1 --minPctIdentity 70.0\
 --minMatch 12  --nproc 24 --bam --out tmp_aln.bam
#{samtools} view -bS tmp_aln.sam | {samtools} sort - {ctg_id}_sorted
{samtools} sort tmp_aln.bam -o {ctg_id}_sorted.bam
{samtools} index {ctg_id}_sorted.bam
rm tmp_aln.bam
date
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
Example #25
def create_quiver_jobs(scattered_quiver_plf):
    scattered_quiver_fn = fn(scattered_quiver_plf)
    jobs = json.loads(open(scattered_quiver_fn).read())
    #ctg_ids = sorted(jobs['ref_seq_data'])
    p_ctg_out = []
    h_ctg_out = []
    job_done_plfs = {}
    for job in jobs:
        ctg_id = job['ctg_id']
        m_ctg_id = ctg_id.split('-')[0]
        wd = os.path.join(os.getcwd(), './4-quiver/', m_ctg_id)
        ref_fasta = makePypeLocalFile(
            os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id=ctg_id)))
        read_sam = makePypeLocalFile(
            os.path.join(
                os.getcwd(), './4-quiver/reads/'
                '{ctg_id}.sam'.format(ctg_id=ctg_id)))
        cns_fasta = makePypeLocalFile(
            os.path.join(wd, 'cns-{ctg_id}.fasta.gz'.format(ctg_id=ctg_id)))
        cns_fastq = makePypeLocalFile(
            os.path.join(wd, 'cns-{ctg_id}.fastq.gz'.format(ctg_id=ctg_id)))
        job_done = makePypeLocalFile(
            os.path.join(wd, '{ctg_id}_quiver_done'.format(ctg_id=ctg_id)))

        if os.path.exists(
                fn(read_sam
                   )):  # TODO(CD): Ask Jason what we should do if missing SAM.
            if ctg_types[ctg_id] == 'p':
                p_ctg_out.append((fn(cns_fasta), fn(cns_fastq)))
            elif ctg_types[ctg_id] == 'h':
                h_ctg_out.append((fn(cns_fasta), fn(cns_fastq)))
            else:
                LOG.warning(
                    'Type is {!r}, not "p" or "h". Why are we running Quiver?'.
                    format(ctg_types[ctg_id]))
            parameters = {
                'job_uid': 'q-' + ctg_id,
                'wd': wd,
                'config': config,
                'ctg_id': ctg_id
            }
            make_quiver_task = PypeTask(
                inputs={
                    'ref_fasta': ref_fasta,
                    'read_sam': read_sam,
                    'scattered_quiver': scattered_quiver_plf,
                },
                outputs={
                    'cns_fasta': cns_fasta,
                    'cns_fastq': cns_fastq,
                    'job_done': job_done
                },
                parameters=parameters,
            )
            quiver_task = make_quiver_task(task_run_quiver)
            wf.addTask(quiver_task)
            job_done_plfs['{}'.format(ctg_id)] = job_done
    #sge_quiver = config['sge_quiver']
    return p_ctg_out, h_ctg_out, job_done_plfs
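
The (cns_fasta, cns_fastq) pairs collected in p_ctg_out/h_ctg_out are what task_cns_zcat (Examples #29/#30) later reads back, one whitespace-separated pair per line. A plausible bridging step, not shown in these examples:

# Write the gathered list that task_cns_zcat expects: one "fasta fastq" pair per line.
with open('gathered_p_ctg.txt', 'w') as ofs:  # filename illustrative
    for cns_fasta_fn, cns_fastq_fn in p_ctg_out:
        ofs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))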
Example #26
def taskrun1(self):
    template = """
sleep_s=%(sleep_s)s
ifile=%(ifile)s
ofile=%(ofile)s

set -vex
echo start1
sleep ${sleep_s}
cp -f ${ifile} ${ofile}
echo end1
"""
    bash = template % dict(
        ifile=fn(self.f0),
        ofile=fn(self.f1),
        sleep_s=self.parameters['sleep_s'],
    )
    log.debug('taskrun1 bash:\n' + bash)
    script = 'taskrun1.sh'
    with open(script, 'w') as ofs:
        ofs.write(bash)
    #system("bash {}".format(script), check=True)
    self.generated_script_fn = script
    return script
Example #27
def taskrun1(self):
    template = """
sleep_s=%(sleep_s)s
ifile=%(ifile)s
ofile=%(ofile)s

set -vex
echo start1
sleep ${sleep_s}
cp -f ${ifile} ${ofile}
echo end1
"""
    bash = template % dict(
        ifile=fn(self.f0),
        ofile=fn(self.f1),
        sleep_s=self.parameters['sleep_s'],
    )
    log.debug('taskrun1 bash:\n' + bash)
    script = 'taskrun1.sh'
    with open(script, 'w') as ofs:
        ofs.write(bash)
    #system("bash {}".format(script), check=True)
    self.generated_script_fn = script
    return script
Example #28
def create_tasks_pbalign(chunk_json_pfn, referenceset_pfn, parameters):
    """Create a pbalign task for each chunk, plus a gathering task.
    """
    tasks = list()
    gathering = dict()
    chunk_dir = os.path.dirname(fn(chunk_json_pfn))
    for i, subreadset_fn in enumerate(sorted(yield_pipeline_chunk_names_from_json(open(fn(chunk_json_pfn)), '$chunk.subreadset_id'))):
        wdir = 'run-pbalign-{:02d}'.format(i)
        subreadset_fn = os.path.join(chunk_dir, os.path.basename(subreadset_fn))
        subreadset_pfn = makePypeLocalFile(subreadset_fn)
        unmapped_pfn = makePypeLocalFile('{wdir}/unmapped.txt'.format(**locals()))
        alignmentset_pfn = makePypeLocalFile('{wdir}/align.subreads.{i:02d}.alignmentset.xml'.format(**locals()))
        gathering['unmapped_{:02d}'.format(i)] = unmapped_pfn
        gathering['alignmentsets_{:02d}'.format(i)] = alignmentset_pfn
        """Also produces:
        aligned.subreads.i.alignmentset.bam
        aligned.subreads.i.alignmentset.bam.bai
        aligned.subreads.i.alignmentset.bam.pbi
        """
        make_task = PypeTask(
                inputs = {"chunk_json": chunk_json_pfn,
                          "dataset": subreadset_pfn,
                          "referenceset": referenceset_pfn,
                },
                outputs = {"alignmentset": alignmentset_pfn,
                           "unmapped": unmapped_pfn,
                },
                parameters = parameters,
        )
        task = make_task(start_task.task_pbalign)
        tasks.append(task)
    o_alignmentset_pfn = makePypeLocalFile('run-pbalign_gather/aligned.subreads.alignmentset.xml')
    o_unmapped_pfn = makePypeLocalFile('run-pbalign_gather/unmapped.txt')
    make_task = PypeTask(
            inputs = gathering,
            outputs = {"o_ds": o_alignmentset_pfn,
                       "o_unmapped": o_unmapped_pfn,
            },
            parameters = parameters,
    )
    task = make_task(start_task.task_pbalign_gather)
    tasks.append(task)
    return tasks, alignmentset_pfn
Example #29
def task_cns_zcat(self):
    gathered_p_ctg_fn = fn(self.gathered_p_ctg)
    gathered_h_ctg_fn = fn(self.gathered_h_ctg)

    cns_p_ctg_fasta_fn = fn(self.cns_p_ctg_fasta)
    cns_p_ctg_fastq_fn = fn(self.cns_p_ctg_fastq)
    cns_h_ctg_fasta_fn = fn(self.cns_h_ctg_fasta)
    cns_h_ctg_fastq_fn = fn(self.cns_h_ctg_fastq)
    job_done_fn = fn(self.job_done)

    rm(cns_p_ctg_fasta_fn)
    touch(cns_p_ctg_fasta_fn)
    rm(cns_p_ctg_fastq_fn)
    touch(cns_p_ctg_fastq_fn)
    with open(gathered_p_ctg_fn) as ifs:
        for line in ifs:
            cns_fasta_fn, cns_fastq_fn = line.split()
            system('zcat {cns_fasta_fn} >> {cns_p_ctg_fasta_fn}'.format(**locals()))
            system('zcat {cns_fastq_fn} >> {cns_p_ctg_fastq_fn}'.format(**locals()))

    # Commented out for now, for recovery purposes:
    #with open(gathered_p_ctg_fn) as ifs:
    #    for line in ifs:
    #        cns_fasta_fn, cns_fastq_fn = line.split()
    #        rm(cns_fasta_fn)
    #        rm(cns_fastq_fn)

    rm(cns_h_ctg_fasta_fn)
    touch(cns_h_ctg_fasta_fn)
    rm(cns_h_ctg_fastq_fn)
    touch(cns_h_ctg_fastq_fn)
    with open(gathered_h_ctg_fn) as ifs:
        for line in ifs:
            cns_fasta_fn, cns_fastq_fn = line.split()
            system('zcat {cns_fasta_fn} >> {cns_h_ctg_fasta_fn}'.format(**locals()))
            system('zcat {cns_fastq_fn} >> {cns_h_ctg_fastq_fn}'.format(**locals()))

    # Commented out for now, for recovery purposes:
    #with open(gathered_h_ctg_fn) as ifs:
    #    for line in ifs:
    #        cns_fasta_fn, cns_fastq_fn = line.split()
    #        rm(cns_fasta_fn)
    #        rm(cns_fastq_fn)

    touch(job_done_fn)
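
rm(), touch(), and system() are helpers that never appear in these examples. Minimal stand-ins consistent with how the tasks use them (assumptions, not the project's actual definitions):

import os

def system(cmd):
    # Run a shell command (the zcat appends above); fail loudly on error.
    rc = os.system(cmd)
    if rc:
        raise RuntimeError('command failed with rc={}: {!r}'.format(rc, cmd))

def rm(*fns):
    # Remove files, ignoring ones that do not exist.
    for f in fns:
        if os.path.exists(f):
            os.remove(f)

def touch(*fns):
    # Create empty files (or update mtimes), like shell 'touch'.
    for f in fns:
        open(f, 'a').close()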
Example #30
def task_cns_zcat(self):
    gathered_p_ctg_fn = fn(self.gathered_p_ctg)
    gathered_h_ctg_fn = fn(self.gathered_h_ctg)

    cns_p_ctg_fasta_fn = fn(self.cns_p_ctg_fasta)
    cns_p_ctg_fastq_fn = fn(self.cns_p_ctg_fastq)
    cns_h_ctg_fasta_fn = fn(self.cns_h_ctg_fasta)
    cns_h_ctg_fastq_fn = fn(self.cns_h_ctg_fastq)
    job_done_fn = fn(self.job_done)

    rm(cns_p_ctg_fasta_fn)
    touch(cns_p_ctg_fasta_fn)
    rm(cns_p_ctg_fastq_fn)
    touch(cns_p_ctg_fastq_fn)
    with open(gathered_p_ctg_fn) as ifs:
        for line in ifs:
            cns_fasta_fn, cns_fastq_fn = line.split()
            system('zcat {cns_fasta_fn} >> {cns_p_ctg_fasta_fn}'.format(
                **locals()))
            system('zcat {cns_fastq_fn} >> {cns_p_ctg_fastq_fn}'.format(
                **locals()))
    with open(gathered_p_ctg_fn) as ifs:
        for line in ifs:
            cns_fasta_fn, cns_fastq_fn = line.split()
            rm(cns_fasta_fn)
            rm(cns_fastq_fn)
    rm(cns_h_ctg_fasta_fn)
    touch(cns_h_ctg_fasta_fn)
    rm(cns_h_ctg_fastq_fn)
    touch(cns_h_ctg_fastq_fn)
    with open(gathered_h_ctg_fn) as ifs:
        for line in ifs:
            cns_fasta_fn, cns_fastq_fn = line.split()
            system('zcat {cns_fasta_fn} >> {cns_h_ctg_fasta_fn}'.format(
                **locals()))
            system('zcat {cns_fastq_fn} >> {cns_h_ctg_fastq_fn}'.format(
                **locals()))
    with open(gathered_h_ctg_fn) as ifs:
        for line in ifs:
            cns_fasta_fn, cns_fastq_fn = line.split()
            rm(cns_fasta_fn)
            rm(cns_fastq_fn)

    touch(job_done_fn)
Example #31
def generate_read_to_ctg_map(self):
    rawread_id_file = fn( self.rawread_id_file )
    pread_id_file = fn( self.pread_id_file )
    read_to_contig_map = fn( self.read_to_contig_map )
    
    pread_did_to_rid = open(pread_id_file).read().split("\n")
    rid_to_oid = open(rawread_id_file).read().split("\n")

    h_ctg_edges = fn( self.h_ctg_edges )
    p_ctg_edges = fn( self.p_ctg_edges )

    h_ctg_ids = set()
    with open(fn(self.h_ctg_ids)) as f:
        for row in f:
            row = row.strip()
            h_ctg_ids.add( row )

    pread_to_contigs = {}

    for fname in (p_ctg_edges, h_ctg_edges):
        with open(fname) as f:
            for row in f:
                row = row.strip().split()
                ctg = row[0]
                if len(ctg.split("_")) > 1 and ctg not in h_ctg_ids:
                    continue
                n1 = row[1]
                n2 = row[2]
                pid1 = int(n1.split(":")[0])
                pid2 = int(n2.split(":")[0])
                rid1 = pread_did_to_rid[pid1].split("/")[1]
                rid2 = pread_did_to_rid[pid2].split("/")[1]
                rid1 = int(int(rid1)/10)
                rid2 = int(int(rid2)/10)
                oid1 = rid_to_oid[rid1]
                oid2 = rid_to_oid[rid2]
                k1 = (pid1, rid1, oid1)
                pread_to_contigs.setdefault( k1, set() )
                pread_to_contigs[ k1 ].add( ctg )
                k2 = (pid2, rid2, oid2)
                pread_to_contigs.setdefault( k2, set() )
                pread_to_contigs[ k2 ].add( ctg )

    with open(read_to_contig_map, "w") as f:
        for k in pread_to_contigs:
            pid, rid, oid = k
            for ctg in list(pread_to_contigs[ k ]):
                print >>f, "%09d %09d %s %s" % (pid, rid, oid, ctg)
Example #32
def generate_read_to_hctg_map(self):
    rawread_id_file = fn(self.rawread_id_file)
    pread_id_file = fn(self.pread_id_file)
    read_to_contig_map = fn(self.read_to_contig_map)

    pread_did_to_rid = open(pread_id_file).read().split('\n')
    rid_to_oid = open(rawread_id_file).read().split('\n')

    h_ctg_edges = fn(self.h_ctg_edges)
    p_ctg_edges = fn(self.p_ctg_edges)

    h_ctg_ids = set()
    with open(fn(self.h_ctg_ids)) as f:
        for row in f:
            row = row.strip()
            h_ctg_ids.add(row)

    pread_to_contigs = {}

    for fname in (p_ctg_edges, h_ctg_edges):
        with open(fname) as f:
            for row in f:
                row = row.strip().split()
                ctg = row[0]
                if len(ctg.split('_')) > 1 and ctg not in h_ctg_ids:
                    continue
                n1 = row[1]
                n2 = row[2]
                pid1 = int(n1.split(':')[0])
                pid2 = int(n2.split(':')[0])
                rid1 = pread_did_to_rid[pid1].split('/')[1]
                rid2 = pread_did_to_rid[pid2].split('/')[1]
                rid1 = int(int(rid1) / 10)
                rid2 = int(int(rid2) / 10)
                oid1 = rid_to_oid[rid1]
                oid2 = rid_to_oid[rid2]
                k1 = (pid1, rid1, oid1)
                pread_to_contigs.setdefault(k1, set())
                pread_to_contigs[k1].add(ctg)
                k2 = (pid2, rid2, oid2)
                pread_to_contigs.setdefault(k2, set())
                pread_to_contigs[k2].add(ctg)

    with open(read_to_contig_map, 'w') as f:
        for k in pread_to_contigs:
            pid, rid, oid = k
            for ctg in list(pread_to_contigs[k]):
                print >> f, '%09d %09d %s %s' % (pid, rid, oid, ctg)
Example #33
def create_quiver_jobs(wf, scattered_quiver_plf):
    scattered_quiver_fn = fn(scattered_quiver_plf)
    jobs = json.loads(open(scattered_quiver_fn).read())
    #ctg_ids = sorted(jobs['ref_seq_data'])
    p_ctg_out = []
    h_ctg_out = []
    job_done_plfs = {}
    for job in jobs:
        ctg_id = job['ctg_id']
        ctg_types = job['ctg_types']
        smrt_bin = job['smrt_bin']
        sge_option = job['sge_option']
        ref_fasta = makePypeLocalFile(job['ref_fasta'])
        read_bam = makePypeLocalFile(job['read_bam'])
        m_ctg_id = ctg_id.split('-')[0]
        wd = os.path.join(os.getcwd(), './4-quiver/', m_ctg_id)
        #ref_fasta = makePypeLocalFile(os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id = ctg_id)))
        #read_bam = makePypeLocalFile(os.path.join(os.getcwd(), './4-quiver/reads/' '{ctg_id}.sam'.format(ctg_id = ctg_id)))
        cns_fasta = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fasta.gz'.format(ctg_id = ctg_id)))
        cns_fastq = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fastq.gz'.format(ctg_id = ctg_id)))
        job_done = makePypeLocalFile(os.path.join(wd, '{ctg_id}_quiver_done'.format(ctg_id = ctg_id)))

        if os.path.exists(fn(read_bam)): # TODO(CD): Ask Jason what we should do if missing SAM.
            if ctg_types[ctg_id] == 'p':
                p_ctg_out.append( (fn(cns_fasta), fn(cns_fastq)) )
            elif ctg_types[ctg_id] == 'h':
                h_ctg_out.append( (fn(cns_fasta), fn(cns_fastq)) )
            else:
                LOG.warning('Type is {!r}, not "p" or "h". Why are we running Quiver?'.format(ctg_types[ctg_id]))
            parameters = {
                'job_uid': 'q-' + ctg_id,
                'ctg_id': ctg_id,
                'smrt_bin': smrt_bin,
                'sge_option': sge_option,
            }
            make_quiver_task = PypeTask(
                inputs = {'ref_fasta': ref_fasta, 'read_bam': read_bam,
                          'scattered_quiver': scattered_quiver_plf},
                outputs = {'cns_fasta': cns_fasta, 'cns_fastq': cns_fastq, 'job_done': job_done},
                parameters = parameters,
            )
            quiver_task = make_quiver_task(task_run_quiver)
            wf.addTask(quiver_task)
            job_done_plfs['{}'.format(ctg_id)] = job_done
    return p_ctg_out, h_ctg_out, job_done_plfs
Example #34
def taskrun0(self):
    template = """
sleep_s=%(sleep_s)s
ofile=%(ofile)s

set -vex
echo start0
sleep ${sleep_s}
touch ${ofile}
echo end0
"""
    bash = template % dict(
        #ifile=fn(self.i0),
        ofile=fn(self.f0),
        sleep_s=self.parameters['sleep_s'],
    )
    log.debug('taskrun0 bash:\n' + bash)
    script = 'taskrun0.sh'
    with open(script, 'w') as ofs:
        ofs.write(bash)
    #system("bash {}".format(script), check=True)
    #spawn(['/bin/bash', script], check=True) # Beware! Hard to kill procs.
    self.generated_script_fn = script
    return script
Example #35
def taskrun0(self):
    template = """
sleep_s=%(sleep_s)s
ofile=%(ofile)s

set -vex
echo start0
sleep ${sleep_s}
touch ${ofile}
echo end0
"""
    bash = template % dict(
        #ifile=fn(self.i0),
        ofile=fn(self.f0),
        sleep_s=self.parameters['sleep_s'],
    )
    log.debug('taskrun0 bash:\n' + bash)
    script = 'taskrun0.sh'
    with open(script, 'w') as ofs:
        ofs.write(bash)
    #system("bash {}".format(script), check=True)
    #spawn(['/bin/bash', script], check=True) # Beware! Hard to kill procs.
    self.generated_script_fn = script
    return script
Example #36
def create_tasks_gc(fofn_pfn, referenceset_pfn, parameters):
    """Create a gc task for each chunk, plus a gathering task.
    Here is the convoluted workflow:
    1. For each gc instance "chunk":
      A. variantCaller writes .fasta
      B. We create a contigset for the .fasta
    2. We keep the contigset output filenames in a FOFN (from run_gc_scatter)
       and pass that to run_gc_gather().
    3. We read each contigset and add them to a gathered ContigSet.
    4. We "consolidate" their underlying .fasta "resources",
       assuming their filenames match except extension.
    5. Finally, we write the gathered contigset.
    Whew!
    We also gather fastq here, for convenience.
    """
    tasks = list()
    contigsets = dict()
    fastqs = dict()
    # Assume the gc chunk paths in the FOFN are all relative to the FOFN's directory.
    for i, alignmentset_bn in enumerate(open(fn(fofn_pfn)).read().split()):
        alignmentset_fn = os.path.join(os.path.dirname(fn(fofn_pfn)), alignmentset_bn)
        wdir = 'run-gc-{:02}'.format(i)
        mkdirs(wdir) # Assume CWD is correct.
        alignmentset_pfn = makePypeLocalFile(alignmentset_fn)  # New pfn, as it was not a pfn before.
        polished_fastq_pfn = makePypeLocalFile(os.path.join(wdir, 'consensus.fastq'))
        variants_gff_pfn = makePypeLocalFile(os.path.join(wdir, 'variants.gff'))
        consensus_contigset_pfn = makePypeLocalFile(os.path.join(wdir, 'consensus.contigset.xml'))
        """Also produces:
        consensus.fasta
        consensus.fasta.fai

        And note that these file names are important, as pbcoretools gathering expects
        a particular pattern.
        """
        contigsets['contigset_{:02d}'.format(i)] = consensus_contigset_pfn
        fastqs['fastq_{:02d}'.format(i)] = polished_fastq_pfn
        make_task = PypeTask(
                inputs = {"alignmentset": alignmentset_pfn,
                          "referenceset": referenceset_pfn,},
                outputs = {
                    "polished_fastq": polished_fastq_pfn,
                    "variants_gff": variants_gff_pfn,
                    "consensus_contigset": consensus_contigset_pfn,
                },
                parameters = parameters,
        )
        task = make_task(start_task.task_genomic_consensus)
        tasks.append(task)
    contigset_pfn = makePypeLocalFile('run-gc-gather/contigset.xml')
    gathered_fastq_pfn = makePypeLocalFile('run-gc-gather/gathered.fastq')
    inputs = dict(contigsets)
    inputs.update(fastqs)
    log.debug('inputs to gc_gather:{}'.format(pprint.pformat(contigsets)))
    make_task = PypeTask(
            inputs = inputs,
            outputs = {"ds_out": contigset_pfn,
                       "fastq_out": gathered_fastq_pfn,
            },
            parameters = parameters,
    )
    task = make_task(start_task.task_gc_gather)
    tasks.append(task)
    return tasks, contigset_pfn, gathered_fastq_pfn
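
Steps 3-5 of the docstring (read each chunk contigset, consolidate the underlying .fasta resources, write the gathered set) would look roughly like this with pbcore's dataset API (a sketch; it assumes ContigSet and consolidate() behave as in pbcoretools gathering):

from pbcore.io import ContigSet  # assumed dependency

def gather_contigsets(contigset_fns, gathered_xml_fn, gathered_fasta_fn):
    ds = ContigSet(*contigset_fns)     # 3. combine each chunk contigset
    ds.consolidate(gathered_fasta_fn)  # 4. merge the underlying .fasta resources
    ds.write(gathered_xml_fn)          # 5. write the gathered contigset XML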
Example #37
File: run1.py  Project: gsc0107/FALCON
def run(
    wf,
    config,
    input_config_fn,
    input_fofn_plf,
):
    """
    Preconditions (for now):
    * fc_run_logger
    * run_support.logger
    """
    rawread_dir = os.path.abspath('./0-rawreads')
    pread_dir = os.path.abspath('./1-preads_ovl')
    falcon_asm_dir = os.path.abspath('./2-asm-falcon')
    script_dir = os.path.abspath('./scripts')
    sge_log_dir = os.path.abspath('./sge_log')

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)

    exitOnFailure = config['stop_all_jobs_on_failure']  # only matters for parallel jobs
    wf.max_jobs = config['default_concurrent_jobs']

    rawread_fofn_plf = makePypeLocalFile(
        os.path.join(rawread_dir, 'raw-fofn-abs',
                     os.path.basename(config['input_fofn'])))
    make_fofn_abs_task = PypeTask(
        inputs={'i_fofn': input_fofn_plf},
        outputs={'o_fofn': rawread_fofn_plf},
        parameters={},
    )
    fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_raw)

    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config['input_type'] == 'raw':
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile(os.path.join(rawread_dir, 'sleep_done'))
        rdb_build_done = makePypeLocalFile(
            os.path.join(rawread_dir, 'rdb_build_done'))
        run_jobs = makePypeLocalFile(os.path.join(rawread_dir, 'run_jobs.sh'))
        parameters = {
            'work_dir': rawread_dir,
            'sge_option': config['sge_option_da'],
            'config_fn': input_config_fn,
            'config': config
        }

        length_cutoff_plf = makePypeLocalFile(
            os.path.join(rawread_dir, 'length_cutoff'))
        raw_reads_db_plf = makePypeLocalFile(
            os.path.join(rawread_dir, '%s.db' % 'raw_reads'))
        make_build_rdb_task = PypeTask(
            inputs={'input_fofn': rawread_fofn_plf},
            outputs={
                'rdb_build_done': rdb_build_done,
                'raw_reads_db': raw_reads_db_plf,
                'length_cutoff': length_cutoff_plf,
                'run_jobs': run_jobs,
            },
            parameters=parameters,
        )
        build_rdb_task = make_build_rdb_task(pype_tasks.task_build_rdb)

        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        raw_reads_nblock = support.get_nblock(fn(raw_reads_db_plf))
        #### run daligner
        wf.max_jobs = config['da_concurrent_jobs']
        scattered_plf = os.path.join(rawread_dir, 'daligner-scatter',
                                     'scattered.json')
        make_daligner_scatter = PypeTask(
            inputs={
                'run_jobs_fn': run_jobs,
                'db_build_done': rdb_build_done,
            },
            outputs={
                'scatter_fn': scattered_plf,
            },
            parameters={
                'db_prefix': 'raw_reads',
                'nblock': raw_reads_nblock,
                'pread_aln': False,
                'config': config,
            },
        )
        task = make_daligner_scatter(pype_tasks.task_daligner_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        daligner_tasks, daligner_out = create_daligner_tasks(
            rawread_dir, scattered_plf)

        wf.addTasks(daligner_tasks)
        r_gathered_las_plf = makePypeLocalFile(
            os.path.join(rawread_dir, 'raw-gather', 'gathered_las.txt'))

        parameters = {
            'nblock': raw_reads_nblock,
        }
        make_daligner_gather = PypeTask(
            inputs=daligner_out,
            outputs={'gathered': r_gathered_las_plf},
            parameters=parameters,
        )
        check_r_da_task = make_daligner_gather(pype_tasks.task_daligner_gather)
        wf.addTask(check_r_da_task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        # Merge .las files.
        wf.max_jobs = config['la_concurrent_jobs']
        scattered_plf = os.path.join(rawread_dir, 'merge-scatter',
                                     'scattered.json')
        make_task = PypeTask(
            inputs={
                'run_jobs': run_jobs,
                'gathered_las': r_gathered_las_plf,
            },
            outputs={
                'scattered': scattered_plf,
            },
            parameters={
                'db_prefix': 'raw_reads',
                'config': config,
            },
        )
        task = make_task(pype_tasks.task_merge_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        merge_tasks, p_ids_merged_las = create_merge_tasks(
            rawread_dir, scattered_plf)
        wf.addTasks(merge_tasks)
        task, _, las_fopfn_plf = create_merge_gather_task(
            os.path.join(rawread_dir, 'merge-gather'), p_ids_merged_las)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        if config['target'] == 'overlapping':
            sys.exit(0)

        # Produce new FOFN of preads fasta, based on consensus of overlaps.
        wf.max_jobs = config['cns_concurrent_jobs']

        scattered_plf = os.path.join(rawread_dir, 'cns-scatter',
                                     'scattered.json')
        make_task = PypeTask(
            inputs={
                'gathered': las_fopfn_plf,
                'db': raw_reads_db_plf,
            },
            outputs={
                'scattered': scattered_plf,
            },
            parameters={
                'db_prefix': 'raw_reads',
                'config': config,
            },
        )
        task = make_task(pype_tasks.task_consensus_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        tasks, consensus_out = create_consensus_tasks(rawread_dir,
                                                      scattered_plf)
        wf.addTasks(tasks)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        task, preads_fofn_plf = create_consensus_gather_task(
            os.path.join(rawread_dir, 'preads'), consensus_out)
        wf.addTask(task)

        rdir = os.path.join(rawread_dir, 'report')
        pre_assembly_report_plf = makePypeLocalFile(
            os.path.join(rdir, 'pre_assembly_stats.json'))
        parameters = dict(config)
        parameters['cwd'] = rdir
        make_task = PypeTask(
            inputs={
                'length_cutoff_fn': length_cutoff_plf,
                'raw_reads_db': raw_reads_db_plf,
                'preads_fofn': preads_fofn_plf,
            },
            outputs={
                'pre_assembly_report': pre_assembly_report_plf,
            },
            parameters=parameters,
        )
        task = make_task(pype_tasks.task_report_pre_assembly)
        wf.addTask(task)

        wf.refreshTargets(exitOnFailure=exitOnFailure)

    if config['target'] == 'pre-assembly':
        log.info('Quitting after stage-0 for "pre-assembly" target.')
        sys.exit(0)

    # build pread database
    if config['input_type'] == 'preads':
        preads_fofn_plf = makePypeLocalFile(
            os.path.join(pread_dir, 'preads-fofn-abs',
                         os.path.basename(config['input_fofn'])))
        make_fofn_abs_task = PypeTask(
            inputs={'i_fofn': rawread_fofn_plf},
            outputs={'o_fofn': preads_fofn_plf},
            parameters={},
        )
        fofn_abs_task = make_fofn_abs_task(
            pype_tasks.task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile(
        os.path.join(pread_dir, 'pdb_build_done'))
    parameters = {
        'work_dir': pread_dir,
        'sge_option': config['sge_option_pda'],
        'config_fn': input_config_fn,
        'config': config
    }

    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    preads_db = makePypeLocalFile(os.path.join(
        pread_dir, 'preads.db'))  # Also .preads.*, of course.
    make_build_pdb_task = PypeTask(
        inputs={'preads_fofn': preads_fofn_plf},
        outputs={
            'pdb_build_done': pdb_build_done,
            'preads_db': preads_db,
            'run_jobs': run_jobs,
        },
        parameters=parameters,
    )
    build_pdb_task = make_build_pdb_task(pype_tasks.task_build_pdb)

    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    preads_nblock = support.get_nblock(fn(preads_db))
    #### run daligner
    wf.max_jobs = config['pda_concurrent_jobs']
    config['sge_option_da'] = config['sge_option_pda']

    scattered_plf = os.path.join(pread_dir, 'daligner-scatter',
                                 'scattered.json')
    make_daligner_scatter = PypeTask(
        inputs={
            'run_jobs_fn': run_jobs,
            'db_build_done': pdb_build_done,
        },
        outputs={
            'scatter_fn': scattered_plf,
        },
        parameters={
            'db_prefix': 'preads',
            'nblock': preads_nblock,
            'pread_aln': True,
            'config': config,
        },
    )
    task = make_daligner_scatter(pype_tasks.task_daligner_scatter)
    wf.addTask(task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    daligner_tasks, daligner_out = create_daligner_tasks(
        pread_dir, scattered_plf)
    wf.addTasks(daligner_tasks)

    p_gathered_las_plf = makePypeLocalFile(
        os.path.join(pread_dir, 'gathered-las', 'gathered-las.txt'))
    parameters = {
        'nblock': preads_nblock,
    }
    make_daligner_gather = PypeTask(
        inputs=daligner_out,
        outputs={'gathered': p_gathered_las_plf},
        parameters=parameters,
    )
    check_p_da_task = make_daligner_gather(pype_tasks.task_daligner_gather)
    wf.addTask(check_p_da_task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    # Merge .las files.
    wf.max_jobs = config['pla_concurrent_jobs']
    config['sge_option_la'] = config['sge_option_pla']
    scattered_plf = os.path.join(pread_dir, 'merge-scatter', 'scattered.json')
    make_task = PypeTask(
        inputs={
            'run_jobs': run_jobs,
            'gathered_las': p_gathered_las_plf,
        },
        outputs={
            'scattered': scattered_plf,
        },
        parameters={
            'db_prefix': 'preads',
            'config': config,
        },
    )
    task = make_task(pype_tasks.task_merge_scatter)
    wf.addTask(task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    merge_tasks, p_ids_merged_las = create_merge_tasks(pread_dir,
                                                       scattered_plf)
    wf.addTasks(merge_tasks)
    task, las_fofn_plf, las_fopfn_plf = create_merge_gather_task(
        os.path.join(pread_dir, 'merge-gather'), p_ids_merged_las)
    wf.addTask(task)

    wf.refreshTargets(exitOnFailure=exitOnFailure)

    # Draft assembly (called 'fc_' for now)
    wf.max_jobs = config['fc_concurrent_jobs']
    db2falcon_dir = os.path.join(pread_dir, 'db2falcon')
    db2falcon_done = makePypeLocalFile(
        os.path.join(db2falcon_dir, 'db2falcon_done'))
    preads4falcon_plf = makePypeLocalFile(
        os.path.join(db2falcon_dir, 'preads4falcon.fasta'))
    make_run_db2falcon = PypeTask(
        inputs={
            'las_fofn_plf': las_fofn_plf,
            'preads_db': preads_db,
        },
        outputs={
            'db2falcon_done': db2falcon_done,
            'preads4falcon': preads4falcon_plf,
        },
        parameters={
            'wd': db2falcon_dir,
            'config': config,
            'sge_option': config['sge_option_fc'],
        },
    )
    wf.addTask(make_run_db2falcon(pype_tasks.task_run_db2falcon))

    falcon_asm_done = makePypeLocalFile(
        os.path.join(falcon_asm_dir, 'falcon_asm_done'))
    make_run_falcon_asm = PypeTask(
        inputs={
            'db2falcon_done': db2falcon_done,
            'db_file': preads_db,
            'preads4falcon': preads4falcon_plf,
            'las_fofn': las_fofn_plf,
        },
        outputs={'falcon_asm_done': falcon_asm_done},
        parameters={
            'wd': falcon_asm_dir,
            'config': config,
            'pread_dir': pread_dir,
            'sge_option': config['sge_option_fc'],
        },
    )
    wf.addTask(make_run_falcon_asm(pype_tasks.task_run_falcon_asm))
    wf.refreshTargets()

    return falcon_asm_done
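
Nearly every task above signals completion through a sentinel file (pdb_build_done, db2falcon_done, falcon_asm_done) rather than through its data outputs, so downstream tasks can depend on "the step finished" even when the real products are many files. A minimal sketch of that convention; the touch/is_done helpers here are illustrative, not from the snippet:

import os

def touch(path):
    # Create, or refresh, an empty sentinel file.
    with open(path, 'a'):
        os.utime(path, None)

def run_step(done_fn):
    # ... do the real work, producing however many files it needs ...
    touch(done_fn)  # written last: the sentinel means "everything before succeeded"

def is_done(done_fn):
    return os.path.exists(done_fn)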
Example #38
def dump_pread_ids(self):
    pread_db = fn(self.pread_db)
    pread_id_file = fn(self.pread_id_file)
    # LD_LIBRARY_PATH is cleared for awk so it runs against the system libraries.
    os.system(
        "DBshow -n %s | tr -d '>' | LD_LIBRARY_PATH= awk '{print $1}' > %s" %
        (pread_db, pread_id_file))
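
os.system() discards failures silently. A sketch of the same DBshow pipeline via subprocess, assuming the same DAZZ_DB tooling on PATH and a bash shell (pipefail makes the pipeline's exit status reflect the first failing stage, not just the last):

import subprocess

def dump_pread_ids_checked(pread_db, pread_id_file):
    # Same pipeline as above, but a non-zero exit raises CalledProcessError.
    cmd = "set -o pipefail; DBshow -n {} | tr -d '>' | awk '{{print $1}}' > {}".format(
        pread_db, pread_id_file)
    subprocess.check_call(cmd, shell=True, executable='/bin/bash')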
Example #39
def flow(config):
    #import pdb; pdb.set_trace()
    parameters = config
    #exitOnFailure=config['stop_all_jobs_on_failure'] # only matter for parallel jobs
    #wf.refreshTargets(exitOnFailure=exitOnFailure)
    #wf = PypeThreadWorkflow()
    #wf = PypeWorkflow()
    #wf = PypeWorkflow(job_type='local')
    log.debug('config=\n{}'.format(pprint.pformat(config)))
    # Set some defaults on the Workflow.
    concurrent_jobs = 24  # TODO: Configure this.
    wf = PypeWorkflow(
        job_type=config['hgap'].get('job_type'),
        job_queue=config['hgap'].get('job_queue'),
        watcher_type=config['hgap'].get('pwatcher_type', 'blocking'),
        #watcher_directory=config['pwatcher_directory'],
        max_jobs=config['hgap'].get('max_jobs', concurrent_jobs),
    )

    use_tmpdir = config['hgap'].get('use_tmpdir')
    if use_tmpdir:
        log.info('hgap.use_tmpdir={!r}'.format(use_tmpdir))
        if use_tmpdir is not True and '/' in use_tmpdir:
            tempfile.tempdir = use_tmpdir
            log.info('Using tempfile.tempdir={}'.format(tempfile.tempdir))
        else:
            log.info('Keeping tempfile.tempdir={}'.format(tempfile.tempdir))

    dataset_pfn = makePypeLocalFile(config['pbsmrtpipe']['input_files'][0])
    filtered_pfn = makePypeLocalFile('run-filterbam/filtered.subreadset.xml')
    make_task = PypeTask(
        inputs={
            "dataset": dataset_pfn,
        },
        outputs={
            "filtered": filtered_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_filterbam)
    wf.addTask(task)

    split_subreadsets_fofn_pfn = makePypeLocalFile(
        'run-bam_scatter/chunked_subreadsets.fofn')
    make_task = PypeTask(
        inputs={
            "dataset": filtered_pfn,
        },
        outputs={
            "split_subreadsets_fofn": split_subreadsets_fofn_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_bam_scatter)
    wf.addTask(task)
    wf.refreshTargets()

    tasks, input_fofn_pfn = create_tasks_fasta2DB(split_subreadsets_fofn_pfn,
                                                  parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()

    fc_cfg_pfn = makePypeLocalFile('run-falcon/fc.cfg')
    fc_json_config_pfn = makePypeLocalFile('run-falcon/fc.json')
    make_task = PypeTask(
        inputs={
            "input_fofn": input_fofn_pfn,
        },
        outputs={
            "fc_cfg": fc_cfg_pfn,
            "fc_json_config": fc_json_config_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_prepare_falcon)
    wf.addTask(task)
    wf.refreshTargets()

    input_config_fn = fn(fc_cfg_pfn)
    with sys.cd('run-falcon'):
        falcon_kit.mains.run1.fc_run_logger = falcon_kit.run_support.logger = logging.getLogger(
            'falcon')
        fc_cfg = falcon_kit.run_support.get_dict_from_old_falcon_cfg(
            falcon_kit.run_support.parse_config(input_config_fn))
        # FALCON takes over the workflow for a while.
        # (For debugging, it is still possible to restart just fc_run, if desired.)
        falcon_asm_done_pfn = falcon_kit.mains.run1.run(
            wf,
            fc_cfg,
            input_config_fn,
            input_fofn_plf=input_fofn_pfn,  # _pfn should be _plf, but oh well
        )
        wf.max_jobs = concurrent_jobs  # in case Falcon changed this

    # Here is a hard-linking task to help us attach falcon into the dependency graph.
    falcon_link_done_pfn = makePypeLocalFile(
        'run-falcon_link/falcon_link_done')
    make_task = PypeTask(
        inputs={
            "falcon_asm_done": falcon_asm_done_pfn,
        },
        outputs={
            "falcon_link_done": falcon_link_done_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_falcon_link)
    wf.addTask(task)

    # The rest of the workflow will operate on datasets, not fasta directly.
    referenceset_pfn = makePypeLocalFile(
        'run-fasta2referenceset/asm.referenceset.xml')
    make_task = PypeTask(
        inputs={
            "falcon_link_done": falcon_link_done_pfn,
        },
        outputs={
            "referenceset": referenceset_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_fasta2referenceset)
    wf.addTask(task)
    wf.refreshTargets()

    # scatter the subreads for pbalign
    """Produces:
    pbalign_chunk.json
    chunk_subreadset_*.subreadset.xml
    """
    pbalign_chunk_json_pfn = makePypeLocalFile(
        'run-pbalign-scatter/pbalign_chunk.json')
    make_task = PypeTask(
        inputs={
            "dataset": dataset_pfn,
            "referenceset": referenceset_pfn,
        },
        outputs={
            "out_json": pbalign_chunk_json_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_pbalign_scatter)
    wf.addTask(task)
    wf.refreshTargets()

    # After scattering, we can specify the pbalign jobs.
    tasks, alignmentset_pfn = create_tasks_pbalign(pbalign_chunk_json_pfn,
                                                   referenceset_pfn,
                                                   parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()

    # scatter the alignmentset for genomic_consensus (variantCaller)
    """Produces:
    gc.chunks.fofn
    ???*.contigset.xml ???
    """
    gc_chunks_fofn_pfn = makePypeLocalFile('run-gc_scatter/gc.chunks.fofn')
    make_task = PypeTask(
        inputs={
            "alignmentset": alignmentset_pfn,
            "referenceset": referenceset_pfn,
        },
        outputs={
            "out_fofn": gc_chunks_fofn_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_gc_scatter)
    wf.addTask(task)
    wf.refreshTargets()

    tasks, contigset_pfn, gathered_fastq_pfn = create_tasks_gc(
        gc_chunks_fofn_pfn, referenceset_pfn, parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()

    # Final report

    polished_assembly_report_json_pfn = makePypeLocalFile(
        'run-polished-assembly-report/polished_assembly_report.json')
    make_task = PypeTask(
        inputs={
            "referenceset": referenceset_pfn,
            "gathered_alignmentset": alignmentset_pfn,
            "polished_fastq": gathered_fastq_pfn,
        },
        outputs={
            "report_json": polished_assembly_report_json_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_polished_assembly_report)
    wf.addTask(task)

    wf.refreshTargets()

    par_dir = os.path.dirname(fn(polished_assembly_report_json_pfn))
    sys.symlink(os.path.join(par_dir, 'polished_coverage_vs_quality.png'))
    sys.symlink(os.path.join(par_dir,
                             'polished_coverage_vs_quality_thumb.png'))
    #return
    ##############

    if not os.path.exists('foo.bar1'):
        sys.system('touch foo.bar1')
    foo_fn1 = makePypeLocalFile('foo.bar1')
    foo_fn2 = makePypeLocalFile('foo.bar2')
    make_task = PypeTask(
        inputs={
            "foo1": foo_fn1,
        },
        outputs={
            "foo2": foo_fn2,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_foo)
    wf.addTask(task)
    wf.refreshTargets()
Example #40
def task_gather_quiver(self):
    """We wrote the "gathered" files during task construction.
    """
    job_done_fn = fn(self.job_done)
    touch(job_done_fn)
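
As the docstring says, a gather task like this can be a runtime no-op because the gathered file list was written while the graph was being built (compare the cns_gather wiring near the end of Example #49). A sketch of that construction-time pattern, reusing the snippet's helpers (mkdir, fn, makePypeLocalFile, PypeTask); create_gather_task is a hypothetical name:

def create_gather_task(gather_dir, inputs_dict):
    mkdir(gather_dir)
    gathered_fn = os.path.join(gather_dir, 'gathered.txt')
    # Written NOW, at graph-build time, not when the task eventually runs.
    with open(gathered_fn, 'w') as f:
        for name, plf in sorted(inputs_dict.items()):
            f.write('{} {}\n'.format(name, fn(plf)))
    job_done_plf = makePypeLocalFile(os.path.join(gather_dir, 'job_done'))
    make_task = PypeTask(inputs=inputs_dict,
                         outputs={'job_done': job_done_plf},
                         parameters={})
    # The runtime body only touches job_done, as in task_gather_quiver above.
    return make_task(task_gather_quiver), gathered_fn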
Example #41
def get_phased_blocks(self):
    vmap_fn = fn(self.vmap_file)
    atable_fn = fn(self.atable_file)
    p_variant_fn = fn(self.phased_variant_file)

    left_connect = {}
    right_connect = {}

    c_score = {}
    states = {}
    positions = set()



    ref_base = {}
    with open(vmap_fn) as f:
        for l in f:
            l = l.strip().split()
            pos = int(l[0])
            ref_b = l[1]
            v_b = l[2]
            q_id = int(l[3])
            ref_base[pos] = ref_b

    with open(atable_fn) as f:
        for l in f:
            l = l.strip().split()
            pos1, b11, b12, pos2, b21, b22, s11, s12, s21, s22 = l
            s11, s12, s21, s22 = int(s11), int(s12), int(s21), int(s22)
            if abs(s11+s22-s12-s21) < 6:
                continue
            pos1 = int(pos1)
            pos2 = int(pos2)
            positions.add(pos1)
            positions.add(pos2)
            right_connect.setdefault(pos1, [])
            right_connect[pos1].append(pos2)
            left_connect.setdefault(pos2, [])
            left_connect[pos2].append(pos1)
            c_score[ (pos1, pos2) ] = { (b11+b21, b12+b22): s11 + s22, (b12+b22, b11+b21): s11 + s22,
                                        (b12+b21, b11+b22): s12 + s21, (b11+b22, b12+b21): s12 + s21 }


            if pos1 not in states:
                st1 = (b11, b12)
                st2 = (b12, b11)
                score1 = 0
                score2 = 0
                for pp in left_connect.get(pos1,[]):
                    if pp in states:
                        st0 = states[pp]
                    else:
                        continue
                    score1 += get_score( c_score, pp, pos1, st0, st1 )
                    score2 += get_score( c_score, pp, pos1, st0, st2 )

                for pp in right_connect.get(pos1,[]):
                    if pp in states:
                        st0 = states[pp]
                    else:
                        continue
                    score1 += get_score( c_score, pos1, pp, st1, st0 )
                    score2 += get_score( c_score, pos1, pp, st2, st0 )

                if score1 >= score2:
                    states[pos1] = st1
                else:
                    states[pos1] = st2

            if pos2 not in states:
                st1 = (b21, b22)
                st2 = (b22, b21)
                score1 = 0
                score2 = 0
                for pp in left_connect.get(pos2,[]):
                    if pp in states:
                        st0 = states[pp]
                    else:
                        continue
                    score1 += get_score( c_score, pp, pos2, st0, st1 )
                    score2 += get_score( c_score, pp, pos2, st0, st2 )

                for pp in right_connect.get(pos2,[]):
                    if pp in states:
                        st0 = states[pp]
                    else:
                        continue
                    score1 += get_score( c_score, pos2, pp, st1, st0 )
                    score2 += get_score( c_score, pos2, pp, st2, st0 )

                if score1 >= score2:
                    states[pos2] = st1
                else:
                    states[pos2] = st2

    positions = list(positions)
    positions.sort()


    iter_count = 0
    while 1:
        iter_count += 1
        if iter_count > 10:
            break
        update_count = 0
        for p in positions:
            b1, b2 = states[p]
            st1 = (b1, b2)
            st2 = (b2, b1)

            score1 = 0
            score2 = 0
            for pp in left_connect.get(p,[]):
                st0 = states[pp]
                score1 += get_score( c_score, pp, p, st0 ,st1)
                score2 += get_score( c_score, pp, p, st0, st2)

            #for pp in right_connect.get(p,[]):
            #    st0 = states[pp]
            #    score1 += get_score( c_score, p, pp, st1 ,st0)
            #    score2 += get_score( c_score, p, pp, st2, st0)

            if score1 >= score2:
                states[p] = st1
            else:
                states[p] = st2
                update_count += 1
        if update_count == 0:
            break


    right_extent = {}
    right_score = {}
    left_extent = {}
    left_score = {}


    for p in positions:

        left_extent[p] = p
        left_score[p] = 0
        if p in left_connect:
            left = p
            st0 = states[p]
            st0_ = st0[1], st0[0]
            for pp in left_connect[p]:
                st1 = states[pp]
                s = get_score( c_score, pp, p, st1, st0)
                s_ = get_score( c_score, pp, p, st1, st0_)
                left_score[p] += s - s_
                if s - s_ > 0 and pp < left:
                    left = pp
            left_extent[p] = left

        right_extent[p] = p
        right_score[p] = 0
        if p in right_connect:
            right = p
            st0 = states[p]
            st0_ = st0[1], st0[0]
            for pp in right_connect[p]:
                st1 = states[pp]
                s = get_score( c_score, p, pp, st0, st1)
                s_ = get_score( c_score, p, pp, st0_, st1)
                right_score[p] += s - s_
                if s - s_ > 0 and pp > right:
                    right = pp
            right_extent[p] = right




    phase_block_id = 1
    phase_blocks = {}
    pb = []

    max_right_ext = 0
    for p in positions:
        if right_score[p] < 10 or left_score[p] < 10:
            continue
        b1, b2 = states[p]
        if max_right_ext < left_extent[p]:
            if len(pb) > 3:
                phase_blocks[phase_block_id] = pb
                phase_block_id += 1
            pb = []
        pb.append( (p, b1, b2) )
        if right_extent[p] > max_right_ext:
            max_right_ext = right_extent[p]
    if len(pb) > 3:
        phase_blocks[phase_block_id] = pb
    else:
        phase_block_id -= 1


    with open(p_variant_fn, "w") as out_f:
        for pid in xrange(1, phase_block_id+1):
            if len(phase_blocks[pid]) == 0:
                continue
            min_ = min( [x[0] for x in phase_blocks[pid]] )
            max_ = max( [x[0] for x in phase_blocks[pid]] )

            print >>out_f, "P", pid, min_, max_, max_ - min_, len(phase_blocks[pid]), 1.0 * (max_-min_)/len(phase_blocks[pid])
            for p, b1, b2 in phase_blocks[pid]:
                rb = ref_base[p]
                print >>out_f, "V", pid, p, "%d_%s_%s" % (p,rb,b1), "%d_%s_%s" % (p,rb,b2), left_extent[p], right_extent[p], left_score[p], right_score[p]
Example #42
def generate_association_table(self):

    vmap_fn = fn(self.vmap_file)
    atable_fn = fn(self.atable_file)
    ctg_id = self.parameters["ctg_id"]
    base_dir = self.parameters["base_dir"]

    vmap = {}
    v_positions = []

    with open(vmap_fn) as f:
        for l in f:
            l = l.strip().split()
            pos = int(l[0])
            ref_b = l[1]
            v_b = l[2]
            q_id = int(l[3])
            if (pos, ref_b) not in vmap:
                v_positions.append( (pos, ref_b) )
            vmap.setdefault( (pos, ref_b), {} )
            vmap[ (pos, ref_b) ].setdefault(v_b, [])
            vmap[ (pos, ref_b) ][v_b].append( q_id )


    #xary = []
    #yary = []
    with open(atable_fn, "w") as out_f:
        for i1 in xrange(len(v_positions)):
            link_count = 0
            for i2 in xrange(i1+1, len(v_positions)):
                pos1, rb1 = v_positions[i1]
                pos2, rb2 = v_positions[i2]
                if pos2 - pos1 > (1 << 16):
                    continue
                ct = {}
                p1table = []
                p2table = []
                s1 = 0
                list1 = vmap[ (pos1, rb1) ].items()
                for b1, qids1 in list1:
                    p1table.append( (b1, len(qids1) ) )
                    s1 += len(qids1)

                s2 = 0
                list2 = vmap[ (pos2, rb2) ].items()
                for b2, qids2 in list2:
                    p2table.append( (b2, len(qids2) ) )
                    s2 += len(qids2)

                total_s = 0
                for b1, qids1 in list1:
                    for b2, qids2 in list2:
                        s = len(set(qids1) & set(qids2))
                        ct[(b1,b2)] = s
                        total_s += s
                if total_s < 6:
                    continue

                b11 = p1table[0][0]
                b12 = p1table[1][0]
                b21 = p2table[0][0]
                b22 = p2table[1][0]
                print >> out_f, pos1, b11, b12, pos2, b21, b22, ct[(b11,b21)], ct[(b11,b22)], ct[(b12,b21)], ct[(b12,b22)]


                #xary.append(pos1)
                #yary.append(pos2)
                link_count += 1
                if link_count > 500:
                    break
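
Each row written here is exactly what get_phased_blocks() parses back: pos1 b11 b12 pos2 b21 b22 followed by the four co-occurrence counts. Its abs(s11+s22-s12-s21) >= 6 filter keeps only position pairs where reads clearly co-vote for one pairing over the other; a worked check:

# s11 = reads carrying (b11, b21), s12 = (b11, b22),
# s21 = (b12, b21), s22 = (b12, b22)
s11, s12, s21, s22 = 20, 1, 2, 18
assert abs(s11 + s22 - s12 - s21) == 35   # strong cis signal: row is used

s11, s12, s21, s22 = 6, 5, 5, 6
assert abs(s11 + s22 - s12 - s21) == 2    # ambiguous: get_phased_blocks skips it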
Example #43
def make_het_call(self):

    bam_fn = fn(self.bam_file)
    ctg_id = self.parameters["ctg_id"]
    ref_seq = self.parameters["ref_seq"]
    base_dir = self.parameters["base_dir"]
    samtools = self.parameters["samtools"]
    vmap_fn = fn(self.vmap_file)
    vpos_fn = fn(self.vpos_file)
    q_id_map_fn = fn(self.q_id_map_file)


    # maybe we should check if the samtools path is valid
    p = subprocess.Popen(shlex.split("%s view %s %s" % (samtools, bam_fn, ctg_id) ), stdout=subprocess.PIPE)
    pileup = {}
    q_id_map = {}
    q_max_id = 0
    q_id = 0
    q_name_to_id = {}

    try:
        os.makedirs("%s/%s" % (base_dir, ctg_id))
    except OSError:
        pass

    vmap = open(vmap_fn, "w")
    vpos = open(vpos_fn, "w")

    for l in p.stdout:
        l = l.strip().split()
        if l[0][0] == "@":
            continue

        QNAME = l[0]
        if QNAME not in q_name_to_id:
            q_id = q_max_id
            q_name_to_id[QNAME] = q_id
            q_max_id += 1

        q_id = q_name_to_id[QNAME]
        q_id_map[q_id] = QNAME
        FLAG = int(l[1])
        RNAME = l[2]
        POS = int(l[3]) - 1 # convert to zero base
        CIGAR = l[5]
        SEQ = l[9]
        rp = POS
        qp = 0

        skip_base = 0
        total_aln_pos = 0
        for m in re.finditer(cigar_re, CIGAR):
            adv = int(m.group(1))
            total_aln_pos += adv

            if m.group(2)  == "S":
                skip_base += adv

        # Skip alignments where less than ~10% of the read is aligned
        # (i.e. mostly soft-clipped).
        if 1.0 - 1.0 * skip_base / total_aln_pos < 0.1:
            continue
        # Ignore short alignments: fewer than 2000 CIGAR-covered bases.
        if total_aln_pos < 2000:
            continue

        for m in re.finditer(cigar_re, CIGAR):
            adv = int(m.group(1))
            if m.group(2) == "S":
                qp += adv
            if m.group(2) in ("M", "=", "X"):
                matches = []
                for i in range(adv):
                    matches.append( (rp, SEQ[qp]) )
                    rp += 1
                    qp += 1
                for pos, b in  matches:
                    pileup.setdefault(pos, {})
                    pileup[pos].setdefault(b, [])
                    pileup[pos][b].append(q_id)
            elif m.group(2) == "I":
                for i in range(adv):
                    qp += 1
            elif m.group(2) == "D":
                for i in range(adv):
                    rp += 1

        pos_k = pileup.keys()
        pos_k.sort()
        th = 0.25
        for pos in pos_k:
            if pos < POS:
                if len(pileup[pos]) < 2:
                    del pileup[pos]
                    continue
                base_count = []
                total_count = 0
                for b in ["A", "C", "G", "T"]:
                    count = len(pileup[pos].get(b,[]))
                    base_count.append( (count, b) )
                    total_count += count
                if total_count < 10:
                    del pileup[pos]
                    continue

                base_count.sort()
                base_count.reverse()
                p0 = 1.0 *  base_count[0][0] / total_count
                p1 = 1.0 *  base_count[1][0] / total_count
                if p0 < 1.0 - th and p1 > th:
                    b0 = base_count[0][1]
                    b1 = base_count[1][1]
                    ref_base = ref_seq[pos]
                    print >> vpos, pos+1, ref_base, total_count, " ".join(["%s %d" % (x[1], x[0]) for x in base_count])
                    for q_id_ in pileup[pos][b0]:
                        print >> vmap, pos+1, ref_base, b0, q_id_
                    for q_id_ in pileup[pos][b1]:
                        print >> vmap, pos+1, ref_base, b1, q_id_
                del pileup[pos]


    with open(q_id_map_fn, "w") as q_id_map_f:
        for q_id, q_name in q_id_map.items():
            print >> q_id_map_f, q_id, q_name
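
cigar_re is referenced above but never defined in this snippet. The conventional pattern for walking a CIGAR string is below; this is an assumption, consistent with the operators the loop branches on (S, M, =, X, I, D):

import re

# Assumed definition: an integer length followed by one CIGAR operator.
cigar_re = r"(\d+)([MIDNSHP=X])"

# e.g. [m.groups() for m in re.finditer(cigar_re, "20S100M2D30M")]
#  -> [('20', 'S'), ('100', 'M'), ('2', 'D'), ('30', 'M')]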
Example #44
def dump_pread_to_ctg(self):
    pread_db = fn( self.pread_db )
    rawread_id_file = fn( self.rawread_id_file )
    pread_id_file = fn( self.pread_id_file )
    phased_read_file = fn( self.phased_reads)
    read_to_contig_map = fn( self.read_to_contig_map )
    las_file = fn( self.las_file )
    pread_to_contig_file = fn( self.pread_to_contig_file )
    read_to_contig_map = fn( self.read_to_contig_map )
    
    pid_to_rid = open(pread_id_file).read().split("\n")
    rid_to_oid = open(rawread_id_file).read().split("\n")


    ovlp_data = []
    ovlp_count = 0
    longest_ovlp = 0
    a_id = None
    pid_to_contigs = {}
    
    with open(read_to_contig_map) as f:
        for row in f:
            row = row.strip().split()
            pid, rid, oid, ctg = row
            pid = int(pid)
            pid_to_contigs.setdefault( pid, (oid, set() ) )
            pid_to_contigs[ pid ][1].add( ctg )
            
    oid_to_phase = {}
    with open(phased_read_file) as f:
        for row in f:
            row = row.strip().split()
            ctg_id, block, phase = row[1:4]
            oid = row[6]
            block = int(block)
            phase = int(phase)
            oid_to_phase[ oid ] = (ctg_id, block, phase)

    with open(pread_to_contig_file, "w") as f:
        ovlp_data = {}
        cur_read_id = None
        skip_rest = 0
        for row in sp.check_output(shlex.split("LA4Falcon -mo %s %s " % (pread_db, las_file)) ).splitlines():

            row = row.strip().split()
            t_id = int(row[1])
            q_id = int(row[0])
            if q_id != cur_read_id:
                if cur_read_id is None:
                    cur_read_id = q_id
                else:
                    if len(ovlp_data) == 0:
                        rid = pid_to_rid[cur_read_id].split("/")[1]
                        rid = int(int(rid)/10)
                        o_id = rid_to_oid[ rid ]
                        print >>f, "%09d %s %s %d %d %d %d" % (cur_read_id, o_id, "NA", 0, 0, 0, 0)
                    else:
                        ovlp_v = ovlp_data.values()
                        ovlp_v.sort()
                        rank = 0
                        for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                            print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
                            rank += 1
                    ovlp_data = {}
                    cur_read_id = q_id
                    skip_rest = 0

            if q_id in pid_to_contigs and len(ovlp_data) == 0: #if the query is in some contig....
                t_o_id, ctgs = pid_to_contigs[ q_id ]
                rid = pid_to_rid[q_id].split("/")[1]
                rid = int(int(rid)/10)
                o_id = rid_to_oid[ rid ]
                for ctg in list(ctgs):
                    ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
                    ovlp_data[ctg][0] = -int(row[7]) 
                    ovlp_data[ctg][1] += 1
                skip_rest = 1

            if skip_rest == 1:
                continue

            if t_id not in pid_to_contigs:
                continue
        
            q_rid = int( int(pid_to_rid[q_id].split("/")[1])/10 )
            q_phase = oid_to_phase.get( rid_to_oid[ q_rid ], None )
            
            if q_phase is not None:
                ctg_id, block, phase = q_phase
                if block != -1:
                    t_rid = int( int(pid_to_rid[t_id].split("/")[1])/10 )
                    t_phase = oid_to_phase.get( rid_to_oid[ t_rid ], None )
                    if t_phase is not None:
                        if t_phase[0] == ctg_id and t_phase[1] == block and t_phase[2] != phase:
                            continue

            t_o_id, ctgs = pid_to_contigs[ t_id ]
            rid = pid_to_rid[q_id].split("/")[1]
            rid = int(int(rid)/10)
            o_id = rid_to_oid[ rid ]
            
            for ctg in list(ctgs):
                ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
                ovlp_data[ctg][0] += int(row[2])
                ovlp_data[ctg][1] += 1

        if len(ovlp_data) != 0:
            ovlp_v = ovlp_data.values()
            ovlp_v.sort()
            rank = 0
            for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
                rank += 1
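
The pread-id to original-read-id translation above is repeated four times. A small helper makes the convention explicit (a sketch with the same logic; the n/10 step is the DB-import convention this snippet assumes):

def pid_to_oid(pid, pid_to_rid, rid_to_oid):
    # pread names look like "<movie>/<n>/...", and the raw-read index is n/10.
    rid = int(int(pid_to_rid[pid].split("/")[1]) / 10)
    return rid_to_oid[rid]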
Example #45
def flow(config):
    #import pdb; pdb.set_trace()
    parameters = config
    #exitOnFailure=config['stop_all_jobs_on_failure'] # only matter for parallel jobs
    #wf.refreshTargets(exitOnFailure=exitOnFailure)
    #wf = PypeThreadWorkflow()
    #wf = PypeWorkflow()
    #wf = PypeWorkflow(job_type='local')
    log.debug('config=\n{}'.format(pprint.pformat(config)))
    # Set some defaults on the Workflow.
    concurrent_jobs = 24 # TODO: Configure this.
    wf = PypeWorkflow(
            job_type=config['hgap'].get('job_type'),
            job_queue=config['hgap'].get('job_queue'),
            watcher_type=config['hgap'].get('pwatcher_type', 'blocking'),
            #watcher_directory=config['pwatcher_directory'],
            max_jobs=config['hgap'].get('max_jobs', concurrent_jobs),
    )

    use_tmpdir = config['hgap'].get('use_tmpdir')
    if use_tmpdir:
        log.info('hgap.use_tmpdir={!r}'.format(use_tmpdir))
        if use_tmpdir is not True and '/' in use_tmpdir:
            tempfile.tempdir = use_tmpdir
            log.info('Using tempfile.tempdir={}'.format(tempfile.tempdir))
        else:
            log.info('Keeping tempfile.tempdir={}'.format(tempfile.tempdir))

    dataset_pfn = makePypeLocalFile(config['pbsmrtpipe']['input_files'][0])
    filtered_pfn = makePypeLocalFile('run-filterbam/filtered.subreadset.xml')
    make_task = PypeTask(
            inputs = {"dataset": dataset_pfn, },
            outputs = {"filtered": filtered_pfn, },
            parameters = parameters,
    )
    task = make_task(start_task.task_filterbam)
    wf.addTask(task)

    split_subreadsets_fofn_pfn = makePypeLocalFile('run-bam_scatter/chunked_subreadsets.fofn')
    make_task = PypeTask(
            inputs = {"dataset": filtered_pfn, },
            outputs =  {"split_subreadsets_fofn": split_subreadsets_fofn_pfn, },
            parameters = parameters,
    )
    task = make_task(start_task.task_bam_scatter)
    wf.addTask(task)
    wf.refreshTargets()

    tasks, input_fofn_pfn = create_tasks_fasta2DB(split_subreadsets_fofn_pfn, parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()

    fc_cfg_pfn = makePypeLocalFile('run-falcon/fc.cfg')
    fc_json_config_pfn = makePypeLocalFile('run-falcon/fc.json')
    make_task = PypeTask(
            inputs = {
                      "input_fofn": input_fofn_pfn,
            },
            outputs = {"fc_cfg": fc_cfg_pfn,
                       "fc_json_config": fc_json_config_pfn,
            },
            parameters = parameters,
    )
    task = make_task(start_task.task_prepare_falcon)
    wf.addTask(task)
    wf.refreshTargets()

    input_config_fn = fn(fc_cfg_pfn)
    with sys.cd('run-falcon'):
        falcon_kit.mains.run1.fc_run_logger = falcon_kit.run_support.logger = logging.getLogger('falcon')
        fc_cfg = falcon_kit.run_support.get_dict_from_old_falcon_cfg(
                falcon_kit.run_support.parse_config(input_config_fn))
        # FALCON takes over the workflow for a while.
        # (For debugging, it is still possible to restart just fc_run, if desired.)
        falcon_asm_done_pfn = falcon_kit.mains.run1.run(wf, fc_cfg,
                input_config_fn,
                input_fofn_plf=input_fofn_pfn, # _pfn should be _plf, but oh well
        )
        wf.max_jobs = concurrent_jobs # in case Falcon changed this

    # Here is a hard-linking task to help us attach falcon into the dependency graph.
    falcon_link_done_pfn = makePypeLocalFile('run-falcon_link/falcon_link_done')
    make_task = PypeTask(
            inputs = {"falcon_asm_done": falcon_asm_done_pfn,},
            outputs = {
                       "falcon_link_done": falcon_link_done_pfn,
            },
            parameters = parameters,
    )
    task = make_task(start_task.task_falcon_link)
    wf.addTask(task)

    # The rest of the workflow will operate on datasets, not fasta directly.
    referenceset_pfn = makePypeLocalFile('run-fasta2referenceset/asm.referenceset.xml')
    make_task = PypeTask(
            inputs =  {"falcon_link_done": falcon_link_done_pfn,},
            outputs = {"referenceset": referenceset_pfn,},
            parameters = parameters,
    )
    task = make_task(start_task.task_fasta2referenceset)
    wf.addTask(task)
    wf.refreshTargets()

    # scatter the subreads for pbalign
    """Produces:
    pbalign_chunk.json
    chunk_subreadset_*.subreadset.xml
    """
    pbalign_chunk_json_pfn = makePypeLocalFile('run-pbalign-scatter/pbalign_chunk.json')
    make_task = PypeTask(
            inputs = {"dataset": dataset_pfn,
                      "referenceset": referenceset_pfn,},
            outputs = {"out_json": pbalign_chunk_json_pfn,},
            parameters = parameters,
    )
    task = make_task(start_task.task_pbalign_scatter)
    wf.addTask(task)
    wf.refreshTargets()

    # After scattering, we can specify the pbalign jobs.
    tasks, alignmentset_pfn = create_tasks_pbalign(pbalign_chunk_json_pfn, referenceset_pfn, parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()

    # scatter the alignmentset for genomic_consensus (variantCaller)
    """Produces:
    gc.chunks.fofn
    ???*.contigset.xml ???
    """
    gc_chunks_fofn_pfn = makePypeLocalFile('run-gc_scatter/gc.chunks.fofn')
    make_task = PypeTask(
            inputs = {"alignmentset": alignmentset_pfn,
                      "referenceset": referenceset_pfn,},
            outputs = {"out_fofn": gc_chunks_fofn_pfn,},
            parameters = parameters,
    )
    task = make_task(start_task.task_gc_scatter)
    wf.addTask(task)
    wf.refreshTargets()

    tasks, contigset_pfn, gathered_fastq_pfn = create_tasks_gc(gc_chunks_fofn_pfn, referenceset_pfn, parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()


    # Final report

    polished_assembly_report_json_pfn = makePypeLocalFile('run-polished-assembly-report/polished_assembly_report.json')
    make_task = PypeTask(
            inputs = {"referenceset": referenceset_pfn,
                      "gathered_alignmentset": alignmentset_pfn,
                      "polished_fastq": gathered_fastq_pfn,},
            outputs = {"report_json": polished_assembly_report_json_pfn,},
            parameters = parameters,
    )
    task = make_task(start_task.task_polished_assembly_report)
    wf.addTask(task)

    wf.refreshTargets()

    par_dir = os.path.dirname(fn(polished_assembly_report_json_pfn))
    sys.symlink(os.path.join(par_dir, 'polished_coverage_vs_quality.png'))
    sys.symlink(os.path.join(par_dir, 'polished_coverage_vs_quality_thumb.png'))
    #return
    ##############

    if not os.path.exists('foo.bar1'):
        sys.system('touch foo.bar1')
    foo_fn1 = makePypeLocalFile('foo.bar1')
    foo_fn2 = makePypeLocalFile('foo.bar2')
    make_task = PypeTask(
            inputs = {"foo1": foo_fn1,},
            outputs =  {"foo2": foo_fn2,},
            parameters = parameters,
    )
    task = make_task(start_task.task_foo)
    wf.addTask(task)
    wf.refreshTargets()
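
The "hard-linking task" comment above describes a useful idiom: when an external sub-workflow (FALCON here) writes files the graph never declared, a trivial task that consumes its sentinel and emits a new one re-anchors later dependencies. A sketch of such a task body, reusing fn() from the snippet; task_link_example is a hypothetical name:

def task_link_example(self):
    src = fn(self.falcon_asm_done)
    dst = fn(self.falcon_link_done)
    # A hard link (or even a bare touch) suffices; only the dependency
    # edge from src to dst matters to the scheduler.
    if os.path.exists(dst):
        os.unlink(dst)
    os.link(src, dst)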
Example #46
def run(wf, config, rule_writer,
        config_fn,
        input_fofn_plf,
        ):
    """
    Preconditions (for now):
    * LOG
    * run_support.logger
    """
    parsed_config = io.deserialize(config_fn)
    if parsed_config != config:
        msg = 'Config from {!r} != passed config'.format(config_fn)
        raise Exception(msg)
    general_config = config['General']
    general_config_fn = os.path.join(os.path.dirname(config_fn), 'General_config.json')
    io.serialize(general_config_fn, general_config) # Some tasks use this.
    rawread_dir = '0-rawreads'
    pread_dir = '1-preads_ovl'
    falcon_asm_dir = '2-asm-falcon'

    for d in (rawread_dir, pread_dir, falcon_asm_dir):
        support.make_dirs(d)

    # only matter for parallel jobs
    job_defaults = config['job.defaults']
    exitOnFailure = bool(job_defaults.get('stop_all_jobs_on_failure', False))
    default_njobs = int(job_defaults.get('njobs', 7))
    wf.max_jobs = default_njobs

    assert general_config['input_type'] in (
        'raw', 'preads'), 'Invalid input_type=={!r}'.format(general_config['input_type'])

    # Store config as JSON, available to many tasks.

    if general_config['input_type'] == 'raw':
        parameters = {}

        # import sequences into daligner DB
        # calculate length_cutoff (if specified as -1)
        # split DB
        # run DBdust
        r_db_dust_fn = os.path.join(rawread_dir, 'build', 'raw_reads.db')
        length_cutoff_fn = os.path.join(rawread_dir, 'build', 'length_cutoff')
        wf.addTask(gen_task(
            script=pype_tasks.TASK_DB_BUILD_SCRIPT,
            inputs={
                'config': general_config_fn,
                'input_fofn': fn(input_fofn_plf),
            },
            outputs={
                'length_cutoff': length_cutoff_fn,
                'db': r_db_dust_fn,
                # Also .raw_reads.*, of course. And dust track.
            },
            parameters=dict(
            ),
            rule_writer=rule_writer,
            dist=Dist(NPROC=1),
        ))

        # run TANmask
        tan_uows_fn = os.path.join(
            rawread_dir, 'tan-split', 'tan-uows.json')
        tan_bash_template_fn = os.path.join(
            rawread_dir, 'tan-split', 'bash_template.sh')
        wf.addTask(gen_task(
            script=pype_tasks.TASK_DB_TAN_SPLIT_SCRIPT,
            inputs={
                'config': general_config_fn,
                'db': r_db_dust_fn,
            },
            outputs={
                'split': tan_uows_fn,
                'bash_template': tan_bash_template_fn,
            },
            parameters={},
            rule_writer=rule_writer,
            dist=Dist(NPROC=1),
        ))

        gathered_fn = os.path.join(rawread_dir, 'tan-gathered', 'gathered-done-files.json')
        gen_parallel_tasks(
            wf, rule_writer,
            tan_uows_fn, gathered_fn,
            run_dict=dict(
                bash_template_fn=tan_bash_template_fn,
                script='fubar-TODO', #pype_tasks.TASK_DB_TAN_APPLY_SCRIPT, # for snakemake stuff
                inputs={
                    'units_of_work': '0-rawreads/tan-chunks/{tan0_id}/some-units-of-work.json',
                },
                outputs={
                    #'job_done': '0-rawreads/{dal0_id}/daligner.done',
                    'results': '0-rawreads/tan-runs/{tan0_id}/some-done-files.json',
                },
                parameters={},

            ),
            dist=Dist(NPROC=4, MB=4000, job_dict=config['job.step.da']),
        )

        r_db_tan_fn = os.path.join(rawread_dir, 'tan-combine', 'raw_reads.db')
        wf.addTask(gen_task(
            script=pype_tasks.TASK_DB_TAN_COMBINE_SCRIPT,
            inputs={
                'config': general_config_fn,
                'db': r_db_dust_fn,
                'gathered': gathered_fn,
            },
            outputs={
                'new_db': r_db_tan_fn,
            },
            parameters={},
            rule_writer=rule_writer,
            dist=Dist(local=True),
        ))

        # run daligner
        wf.max_jobs = config['job.step.da'].get('njobs', default_njobs)
        #rawreads_db_fn = os.path.join(rawread_dir, 'raw_reads.db')
        daligner_all_units_fn = os.path.join(
            rawread_dir, 'daligner-split', 'all-units-of-work.json')
        daligner_bash_template_fn = os.path.join(
            rawread_dir, 'daligner-split', 'daligner_bash_template.sh')
        params = dict(parameters)
        #params['db_prefix'] = 'raw_reads'
        #params['pread_aln'] = 0
        params['skip_checks'] = int(general_config.get('skip_checks', 0))
        params['wildcards'] = 'dal0_id'
        wf.addTask(gen_task(
            script=pype_tasks.TASK_DB_DALIGNER_SPLIT_SCRIPT,
            inputs={
                'config': general_config_fn,
                'db': r_db_tan_fn,
                'length_cutoff': length_cutoff_fn,
            },
            outputs={
                'split': daligner_all_units_fn,
                'bash_template': daligner_bash_template_fn
            },
            parameters=params,
            rule_writer=rule_writer,
            dist=Dist(local=True, NPROC=4), # really, NPROC=1, but we need to know the max
        ))

        gathered_fn = os.path.join(rawread_dir, 'daligner-gathered', 'gathered-done-files.json')
        gen_parallel_tasks(
            wf, rule_writer,
            daligner_all_units_fn, gathered_fn,
            run_dict=dict(
                bash_template_fn=daligner_bash_template_fn,
                script=pype_tasks.TASK_DB_DALIGNER_APPLY_SCRIPT, # for snakemake stuff
                inputs={
                    'units_of_work': os.path.join(rawread_dir, 'daligner-chunks/{dal0_id}/some-units-of-work.json'),
                },
                outputs={
                    'results': os.path.join(rawread_dir, 'daligner-runs/{dal0_id}/some-done-files.json'),
                },
                parameters={},
            ),
            dist=Dist(NPROC=4, MB=4000, job_dict=config['job.step.da']),
        )

        r_gathered_las_fn = os.path.join(rawread_dir, 'daligner-combine', 'gathered-las.json')
        wf.addTask(gen_task(
            script=pype_tasks.TASK_DB_DALIGNER_COMBINE_SCRIPT,
            inputs={
                'config': general_config_fn,
                'db': r_db_tan_fn,
                'gathered': gathered_fn,
            },
            outputs={
                'las_paths': r_gathered_las_fn,
            },
            parameters={},
            rule_writer=rule_writer,
            #dist=Dist(NPROC=1, MB=4000, job_dict=config['job.step.da'])
            dist=Dist(local=True),
        ))

        # Merge .las files.
        wf.max_jobs = config['job.step.la'].get('njobs', default_njobs)
        las_merge_all_units_fn = os.path.join(rawread_dir, 'las-merge-split', 'all-units-of-work.json')
        bash_template_fn = os.path.join(rawread_dir, 'las-merge-split', 'las-merge-bash-template.sh')
        params = dict(parameters)
        params['db_prefix'] = 'raw_reads'
        params['wildcards'] = 'mer0_id'
        wf.addTask(gen_task(
            script=pype_tasks.TASK_DB_LAMERGE_SPLIT_SCRIPT,
            inputs={
                'config': general_config_fn,
                'las_paths': r_gathered_las_fn,
            },
            outputs={
                'split': las_merge_all_units_fn,
                'bash_template': bash_template_fn,
            },
            parameters=params,
            rule_writer=rule_writer,
            dist=Dist(local=True),
        ))

        gathered_fn = os.path.join(rawread_dir, 'las-merge-gathered', 'gathered.json')
        gen_parallel_tasks(
            wf, rule_writer,
            las_merge_all_units_fn, gathered_fn,
            run_dict=dict(
                bash_template_fn=bash_template_fn,
                script=pype_tasks.TASK_DB_LAMERGE_APPLY_SCRIPT, # for snakemake
                inputs={
                    #'las_paths': './0-rawreads/merge-scripts/{mer0_id}/las_paths.json',
                    #'merge_script': './0-rawreads/merge-scripts/{mer0_id}/merge-script.sh',
                    #'merged_las_json': './0-rawreads/merge-scripts/{mer0_id}/merged_las.json',
                    'units_of_work': '0-rawreads/las-merge-chunks/{mer0_id}/some-units-of-work.json',
                },
                outputs={
                    #'merged_las': './0-rawreads/{mer0_id}/merged.las',
                    #'job_done': './0-rawreads/{mer0_id}/merge.done',
                    'results': '0-rawreads/las-merge-runs/{mer0_id}/some-las-paths.json',
                },
                parameters={},
            ),
            dist=Dist(NPROC=1, job_dict=config['job.step.la']),
        )

        p_id2las_fn = os.path.join(rawread_dir, 'las-merge-combine', 'p_id2las.json')
        las_fofn_fn = os.path.join(rawread_dir, 'las-merge-combine', 'las_fofn.json')
        wf.addTask(gen_task(
            script=pype_tasks.TASK_DB_LAMERGE_COMBINE_SCRIPT,
            inputs={
                'config': general_config_fn,
                'gathered': gathered_fn,
            },
            outputs={
                'block2las': p_id2las_fn,
                'las_paths': las_fofn_fn,
            },
            parameters={},
            rule_writer=rule_writer,
            dist=Dist(local=True),
        ))

        if general_config['target'] == 'overlapping':
            sys.exit(0)

        # Produce new FOFN of preads fasta, based on consensus of overlaps.
        wf.max_jobs = config['job.step.cns'].get('njobs', default_njobs)

        split_fn = os.path.join(
            rawread_dir, 'cns-split', 'split.json')
        bash_template_fn = os.path.join(
            rawread_dir, 'cns-split', 'consensus-bash-template.sh')
        params = dict(parameters)
        params['wildcards'] = 'cns0_id,cns0_id2'
        wf.addTask(gen_task(
            script=pype_tasks.TASK_CONSENSUS_SPLIT_SCRIPT,
            inputs={
                'p_id2las': p_id2las_fn,
                'raw_reads_db': r_db_tan_fn,
                'length_cutoff': length_cutoff_fn,
                'config': general_config_fn,
            },
            outputs={
                'split': split_fn,
                'bash_template': bash_template_fn,
            },
            parameters=params,
            rule_writer=rule_writer,
            dist=Dist(local=True),
        ))

        gathered_fn = os.path.join(rawread_dir, 'cns-gather', 'gathered.json')
        gen_parallel_tasks(
            wf, rule_writer,
            split_fn, gathered_fn,
            run_dict=dict(
                bash_template_fn=bash_template_fn,
                script=pype_tasks.TASK_CONSENSUS_TASK_SCRIPT, # for snakemake only
                inputs = {
                    #'las': '0-rawreads/cns-split/{cns0_id}/merged.{cns0_id2}.las',
                    #'db': r_db_tan_fn,
                    #'length_cutoff': length_cutoff_fn,
                    #'config': general_config_fn,
                    'units_of_work': '0-rawreads/cns-chunks/{cns0_id}/some-units-of-work.json',
                },
                outputs = {
                    #'fasta': '0-rawreads/consensus/{cns0_id}/consensus.{cns0_id2}.fasta',
                    'results': '0-rawreads/cns-runs/{cns0_id}/some-done-files.json',
                },
                parameters={},
            ),
            dist=Dist(NPROC=6, job_dict=config['job.step.cns']),
        )
        preads_fofn_fn = os.path.join(rawread_dir, 'preads', 'input_preads.fofn')
        wf.addTask(gen_task(
            script=pype_tasks.TASK_CONSENSUS_GATHER_SCRIPT,
            inputs={
                'gathered': gathered_fn,
            },
            outputs={
                'preads_fofn': preads_fofn_fn,
            },
            parameters=parameters, #{},
            rule_writer=rule_writer,
            dist=Dist(local=True),
        ))

        rdir = os.path.join(rawread_dir, 'report')
        pre_assembly_report_fn = os.path.join(rdir, 'pre_assembly_stats.json')
        params = dict(parameters)
        params['length_cutoff_user'] = general_config['length_cutoff']
        params['genome_length'] = general_config['genome_size'] # note different name; historical
        wf.addTask(gen_task(
            script=pype_tasks.TASK_REPORT_PRE_ASSEMBLY_SCRIPT,
            inputs={'length_cutoff': length_cutoff_fn,
                    'raw_reads_db': r_db_tan_fn,
                    'preads_fofn': preads_fofn_fn,
                    'config': general_config_fn,
            },
            outputs={'pre_assembly_report': pre_assembly_report_fn,
            },
            parameters=params,
            rule_writer=rule_writer,
            dist=Dist(local=True),
        ))

    if general_config['target'] == 'pre-assembly':
        LOG.info('Quitting after stage-0 for "pre-assembly" target.')
        sys.exit(0)

    # build pread database
    if general_config['input_type'] == 'preads':
        """
        preads_fofn_plf = makePypeLocalFile(os.path.join(
            pread_dir, 'preads-fofn-abs', os.path.basename(general_config['input_fofn'])))
        make_fofn_abs_task = PypeTask(inputs={'i_fofn': input_fofn_plf},
                                      outputs={'o_fofn': preads_fofn_plf},
                                      parameters={},
                                      )
        fofn_abs_task = make_fofn_abs_task(
            pype_tasks.task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])
        """
        raise Exception('TODO')

    pdb_build_done = os.path.join(pread_dir, 'pdb_build_done')
    run_jobs_fn = os.path.join(pread_dir, 'run_jobs.sh')
    preads_db_fn = os.path.join(pread_dir, 'build', 'preads.db')
    length_cutoff_pr_fn = os.path.join(pread_dir, 'build', 'length_cutoff')

    wf.addTask(gen_task(
        script=pype_tasks.TASK_DB_BUILD_SCRIPT,
        inputs={
            'config': general_config_fn,
            'input_fofn': preads_fofn_fn,
        },
        outputs={
            'length_cutoff': length_cutoff_pr_fn,
            'db': preads_db_fn,
            # Also .preads.*, of course.
        },
        parameters=dict(
        ),
        rule_writer=rule_writer,
        dist=Dist(NPROC=1),
    ))

    # run daligner
    wf.max_jobs = config['job.step.pda'].get('njobs', default_njobs)
    daligner_all_units_fn = os.path.join(
        pread_dir, 'daligner-split', 'all-units-of-work.json')
    daligner_bash_template_fn = os.path.join(
        pread_dir, 'daligner-split', 'daligner_bash_template.sh')
    params = dict(parameters)
    params['skip_checks'] = int(general_config.get('skip_checks', 0))
    params['wildcards'] = 'dal1_id'
    wf.addTask(gen_task(
        script=pype_tasks.TASK_DB_DALIGNER_SPLIT_SCRIPT,
        inputs={
            'config': general_config_fn,
            'db': preads_db_fn, #not tan, yet
            'length_cutoff': length_cutoff_pr_fn,
        },
        outputs={
            'split': daligner_all_units_fn,
            'bash_template': daligner_bash_template_fn
        },
        parameters=params,
        rule_writer=rule_writer,
        dist=Dist(local=True, NPROC=4), # really, NPROC=1, but we need to know the max
    ))

    gathered_fn = os.path.join(pread_dir, 'daligner-gathered', 'gathered-done-files.json')
    gen_parallel_tasks(
        wf, rule_writer,
        daligner_all_units_fn, gathered_fn,
        run_dict=dict(
            bash_template_fn=daligner_bash_template_fn,
            script=pype_tasks.TASK_DB_DALIGNER_APPLY_SCRIPT, # for snakemake stuff
            inputs={
                'units_of_work': os.path.join(pread_dir, 'daligner-chunks/{dal1_id}/some-units-of-work.json'),
            },
            outputs={
                'results': os.path.join(pread_dir, 'daligner-runs/{dal1_id}/some-done-files.json'),
            },
            parameters={},
        ),
        dist=Dist(NPROC=4, MB=4000, job_dict=config['job.step.pda']),
    )

    gathered_las_fn = os.path.join(pread_dir, 'daligner-combine', 'gathered-las.json')
    wf.addTask(gen_task(
        script=pype_tasks.TASK_DB_DALIGNER_COMBINE_SCRIPT,
        inputs={
            'config': general_config_fn,
            'db': preads_db_fn, #r_db_tan_fn,
            'gathered': gathered_fn,
        },
        outputs={
            'las_paths': gathered_las_fn,
        },
        parameters={},
        rule_writer=rule_writer,
        #dist=Dist(NPROC=1, MB=4000, job_dict=config['job.step.pda'])
        dist=Dist(local=True),
    ))

    # Merge .las files.
    wf.max_jobs = config['job.step.pla'].get('njobs', default_njobs)
    las_merge_all_units_fn = os.path.join(pread_dir, 'las-merge-split', 'all-units-of-work.json')
    bash_template_fn = os.path.join(pread_dir, 'las-merge-split', 'las-merge-bash-template.sh')
    params = dict(parameters)
    params['db_prefix'] = 'preads'
    params['wildcards'] = 'mer1_id'
    wf.addTask(gen_task(
        script=pype_tasks.TASK_DB_LAMERGE_SPLIT_SCRIPT,
        inputs={
            'config': general_config_fn,
            'las_paths': gathered_las_fn,
        },
        outputs={
            'split': las_merge_all_units_fn,
            'bash_template': bash_template_fn,
        },
        parameters=params,
        rule_writer=rule_writer,
        dist=Dist(local=True),
    ))

    gathered_fn = os.path.join(pread_dir, 'las-merge-gathered', 'gathered.json')
    gen_parallel_tasks(
        wf, rule_writer,
        las_merge_all_units_fn, gathered_fn,
        run_dict=dict(
            bash_template_fn=bash_template_fn,
            script=pype_tasks.TASK_DB_LAMERGE_APPLY_SCRIPT, # for snakemake
            inputs={
                'units_of_work': os.path.join(pread_dir, 'las-merge-chunks/{mer1_id}/some-units-of-work.json'),
            },
            outputs={
                'results': os.path.join(pread_dir, 'las-merge-runs/{mer1_id}/some-las-paths.json'),
            },
            parameters={},
        ),
        dist=Dist(NPROC=1, job_dict=config['job.step.pla']),
    )

    p_id2las_fn = os.path.join(pread_dir, 'las-merge-combine', 'block2las.json')
    las_fofn_fn = os.path.join(pread_dir, 'las-merge-combine', 'las_fofn.json')
    wf.addTask(gen_task(
        script=pype_tasks.TASK_DB_LAMERGE_COMBINE_SCRIPT,
        inputs={
            'config': general_config_fn,
            'gathered': gathered_fn,
        },
        outputs={
            'block2las': p_id2las_fn,
            'las_paths': las_fofn_fn,
        },
        parameters={},
        rule_writer=rule_writer,
        dist=Dist(local=True),
    ))

    wf.max_jobs = config['job.step.asm'].get('njobs', default_njobs)
    db2falcon_dir = os.path.join(pread_dir, 'db2falcon')
    db2falcon_done_fn = os.path.join(db2falcon_dir, 'db2falcon_done')
    preads4falcon_fn = os.path.join(db2falcon_dir, 'preads4falcon.fasta')
    wf.addTask(gen_task(
        script=pype_tasks.TASK_RUN_DB_TO_FALCON_SCRIPT,
        inputs={'p_id2las': p_id2las_fn,
                'preads_db': preads_db_fn,
                },
        outputs={'job_done': db2falcon_done_fn,
                 'preads4falcon': preads4falcon_fn,
                 },
        parameters={},
        rule_writer=rule_writer,
        dist=Dist(NPROC=4, job_dict=config['job.step.asm']),
    ))

    falcon_asm_done_fn = os.path.join(falcon_asm_dir, 'falcon_asm_done')
    for key in ('overlap_filtering_setting', 'length_cutoff_pr', 'fc_ovlp_to_graph_option'):
        parameters[key] = general_config[key]
    wf.addTask(gen_task(
        script=pype_tasks.TASK_RUN_FALCON_ASM_SCRIPT,
        inputs={'db2falcon_done': db2falcon_done_fn, 'db_file': preads_db_fn,
                'preads4falcon_fasta': preads4falcon_fn,
                'las_fofn': las_fofn_fn,
                'config': general_config_fn,
                },
        outputs={'falcon_asm_done': falcon_asm_done_fn},
        parameters=parameters,
        rule_writer=rule_writer,
        dist=Dist(NPROC=4, job_dict=config['job.step.asm']),
    ))
    wf.refreshTargets()

    with io.cd('0-rawreads'):
        # for backwards-compatibility
        io.symlink('las-merge-combine', 'las-gather')
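
Both sys.cd(...) and io.cd(...) above are change-directory context managers. A minimal sketch of such a helper (an assumption; the real pypeflow utility may differ):

import contextlib
import os

@contextlib.contextmanager
def cd(newdir):
    # chdir in, and always chdir back, even if the body raises.
    prevdir = os.getcwd()
    os.chdir(newdir)
    try:
        yield
    finally:
        os.chdir(prevdir)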
Example #47
def dump_pread_ids(self):
    pread_db = fn( self.pread_db )
    pread_id_file = fn( self.pread_id_file )
    os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (pread_db, pread_id_file) )
Example #48
def create_tasks_gc(fofn_pfn, referenceset_pfn, parameters):
    """Create a gc task for each chunk, plus a gathering task.
    Here is the convoluted workflow:
    1. For each gc instance "chunk":
      A. variantCaller writes .fasta
      B. We create a contigset for the .fasta
    2. We keep the contigset output filenames in a FOFN (from run_gc_scatter)
       and pass that to run_gc_gather().
    3. We read each contigset and add them to a gathered ContigSet.
    4. We "consolidate" their underlying .fasta "resources",
       assuming their filenames match except extension.
    5. Finally, we write the gathered contigset.
    Whew!
    We also gather fastq here, for convenience.
    """
    tasks = list()
    contigsets = dict()
    fastqs = dict()
    # Assume fofn of gc chunks are all relative to the dir of the fofn.
    for i, alignmentset_bn in enumerate(open(fn(fofn_pfn)).read().split()):
        alignmentset_fn = os.path.join(os.path.dirname(fn(fofn_pfn)),
                                       alignmentset_bn)
        wdir = 'run-gc-{:02}'.format(i)
        mkdirs(wdir)  # Assume CWD is correct.
        alignmentset_pfn = makePypeLocalFile(
            alignmentset_fn)  # New pfn cuz it was not pfn before.
        polished_fastq_pfn = makePypeLocalFile(
            os.path.join(wdir, 'consensus.fastq'))
        variants_gff_pfn = makePypeLocalFile(os.path.join(
            wdir, 'variants.gff'))
        consensus_contigset_pfn = makePypeLocalFile(
            os.path.join(wdir, 'consensus.contigset.xml'))
        """Also produces:
        consensus.fasta
        consensus.fasta.fai

        And note that these files names are important, as pbcoretools gathering expects
        a particular pattern.
        """
        contigsets['contigset_{:02d}'.format(i)] = consensus_contigset_pfn
        fastqs['fastq_{:02d}'.format(i)] = polished_fastq_pfn
        make_task = PypeTask(
            inputs={
                "alignmentset": alignmentset_pfn,
                "referenceset": referenceset_pfn,
            },
            outputs={
                "polished_fastq": polished_fastq_pfn,
                "variants_gff": variants_gff_pfn,
                "consensus_contigset": consensus_contigset_pfn,
            },
            parameters=parameters,
        )
        task = make_task(start_task.task_genomic_consensus)
        tasks.append(task)
    contigset_pfn = makePypeLocalFile('run-gc-gather/contigset.xml')
    gathered_fastq_pfn = makePypeLocalFile('run-gc-gather/gathered.fastq')
    inputs = dict(contigsets)
    inputs.update(fastqs)
    log.debug('inputs to gc_gather:{}'.format(pprint.pformat(contigsets)))
    make_task = PypeTask(
        inputs=inputs,
        outputs={
            "ds_out": contigset_pfn,
            "fastq_out": gathered_fastq_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_gc_gather)
    tasks.append(task)
    return tasks, contigset_pfn, gathered_fastq_pfn
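
Step 4 of the docstring, consolidating the underlying .fasta resources, happens inside start_task.task_gc_gather via pbcore dataset APIs, which are not shown here. The fastq half of the gather is simpler, since fastq records are self-delimiting; a sketch:

def gather_fastqs(fastq_fns, out_fn):
    # Plain concatenation in a stable order; 4-line fastq records need no
    # re-parsing, so gathering is just appending whole files.
    with open(out_fn, 'w') as out:
        for p in sorted(fastq_fns):
            with open(p) as f:
                for chunk in iter(lambda: f.read(1 << 20), ''):
                    out.write(chunk)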
Example #49
def main(argv=sys.argv):
    global LOG
    LOG = support.setup_logger(None)

    if len(argv) < 2:
        print >> sys.stderr, 'you need to provide a configuration file to specify the cluster running environment'
        sys.exit(1)

    config_fn = argv[1]
    config_absbasedir = os.path.dirname(os.path.abspath(config_fn))

    config = ConfigParser.ConfigParser()
    config.read(config_fn)

    job_type = 'SGE'
    if config.has_option('General', 'job_type'):
        job_type = config.get('General', 'job_type')

    sge_track_reads = ' -pe smp 12 -q bigmem'
    if config.has_option('Unzip', 'sge_track_reads'):
        sge_track_reads = config.get('Unzip', 'sge_track_reads')

    sge_quiver = ' -pe smp 24 -q bigmem '
    if config.has_option('Unzip', 'sge_quiver'):
        sge_quiver = config.get('Unzip', 'sge_quiver')

    smrt_bin = '/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/'
    if config.has_option('Unzip', 'smrt_bin'):
        smrt_bin = config.get('Unzip', 'smrt_bin')

    input_bam_fofn = 'input_bam.fofn'
    if config.has_option('Unzip', 'input_bam_fofn'):
        input_bam_fofn = config.get('Unzip', 'input_bam_fofn')
    if not os.path.isabs(input_bam_fofn):
        input_bam_fofn = os.path.join(config_absbasedir, input_bam_fofn)

    quiver_concurrent_jobs = 8
    if config.has_option('Unzip', 'quiver_concurrent_jobs'):
        quiver_concurrent_jobs = config.getint('Unzip',
                                               'quiver_concurrent_jobs')

    config = {
        'job_type': job_type,
        'sge_quiver': sge_quiver,
        'sge_track_reads': sge_track_reads,
        'input_bam_fofn': input_bam_fofn,
        'smrt_bin': smrt_bin
    }
    LOG.info('config={}'.format(pprint.pformat(config)))

    #support.job_type = 'SGE' #tmp hack until we have a configuration parser

    wf = PypeProcWatcherWorkflow(max_jobs=quiver_concurrent_jobs)

    abscwd = os.path.abspath('.')
    parameters = {
        'wd': os.path.join(abscwd, '4-quiver', 'track_reads_h'),
        'config': config
    }
    hasm_done_plf = makePypeLocalFile(
        './3-unzip/1-hasm/hasm_done')  # by convention
    track_reads_h_done_plf = makePypeLocalFile(
        os.path.join(parameters['wd'], 'track_reads_h_done'))
    make_track_reads_task = PypeTask(
        inputs={'hasm_done': hasm_done_plf},
        outputs={'job_done': track_reads_h_done_plf},
        parameters=parameters,
    )
    track_reads_task = make_track_reads_task(task_track_reads)
    #sge_track_reads = config['sge_track_reads']

    wf.addTask(track_reads_task)

    scattered_quiver_plf = makePypeLocalFile(
        '4-quiver/quiver_scatter/scattered.json')
    make_task = PypeTask(
        inputs={
            'p_ctg_fa': makePypeLocalFile('3-unzip/all_p_ctg.fa'),
            'h_ctg_fa': makePypeLocalFile('3-unzip/all_h_ctg.fa'),
            'track_reads_h_done': track_reads_h_done_plf,
        },
        outputs={
            'scattered_quiver_json': scattered_quiver_plf,
        },
        parameters={},
    )
    wf.addTask(make_task(task_scatter_quiver))
    wf.refreshTargets()

    p_ctg_out, h_ctg_out, job_done_plfs = create_quiver_jobs(
        scattered_quiver_plf)

    gathered_p_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/p_ctg.txt')
    gathered_h_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/h_ctg.txt')
    gather_done_plf = makePypeLocalFile('4-quiver/cns_gather/job_done')
    mkdir('4-quiver/cns_gather')
    with open(fn(gathered_p_ctg_plf), 'w') as ofs:
        for cns_fasta_fn, cns_fastq_fn in sorted(p_ctg_out):
            ofs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))
    with open(fn(gathered_h_ctg_plf), 'w') as ofs:
        for cns_fasta_fn, cns_fastq_fn in sorted(h_ctg_out):
            ofs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))

    make_task = PypeTask(
        inputs=job_done_plfs,
        outputs={
            'job_done': gather_done_plf,
        },
        parameters={},
    )
    wf.addTask(make_task(task_gather_quiver))
    wf.refreshTargets()

    cns_p_ctg_fasta_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_p_ctg.fasta')
    cns_p_ctg_fastq_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_p_ctg.fastq')
    cns_h_ctg_fasta_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_h_ctg.fasta')
    cns_h_ctg_fastq_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_h_ctg.fastq')
    zcat_done_plf = makePypeLocalFile('4-quiver/cns_output/job_done')
    make_task = PypeTask(
        inputs={
            'gathered_p_ctg': gathered_p_ctg_plf,
            'gathered_h_ctg': gathered_h_ctg_plf,
            'gather_done': gather_done_plf,
        },
        outputs={
            'cns_p_ctg_fasta': cns_p_ctg_fasta_plf,
            'cns_p_ctg_fastq': cns_p_ctg_fastq_plf,
            'cns_h_ctg_fasta': cns_h_ctg_fasta_plf,
            'cns_h_ctg_fastq': cns_h_ctg_fastq_plf,
            'job_done': zcat_done_plf,
        },
    )
    wf.addTask(make_task(task_cns_zcat))

    wf.refreshTargets()
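A hypothetical configuration file that the main() above would accept; every key is optional and falls back to the hard-coded defaults, and the paths shown are placeholders:

[General]
job_type = SGE

[Unzip]
sge_track_reads = -pe smp 12 -q bigmem
sge_quiver = -pe smp 24 -q bigmem
smrt_bin = /path/to/smrtcmds/bin/
input_bam_fofn = input_bam.fofn
quiver_concurrent_jobs = 8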
Example #50
0
def main(argv=sys.argv):
    global LOG
    LOG = support.setup_logger(None)


    if len(argv) < 2:
        print >> sys.stderr, 'you need to provide a configuration file to specify the cluster running environment'
        sys.exit(1)

    config_fn = argv[1]
    config_absbasedir = os.path.dirname(os.path.abspath(config_fn))

    config = ConfigParser.ConfigParser()
    config.read(config_fn)


    job_type = 'SGE'
    if config.has_option('General', 'job_type'):
        job_type = config.get('General', 'job_type')

    job_queue = 'default'
    if config.has_option('General', 'job_queue'):
        job_queue = config.get('General', 'job_queue')

    pwatcher_type = 'fs_based'
    if config.has_option('General', 'pwatcher_type'):
        pwatcher_type = config.get('General', 'pwatcher_type')

    sge_track_reads = ' -pe smp 12 -q bigmem'
    if config.has_option('Unzip', 'sge_track_reads'):
        sge_track_reads = config.get('Unzip', 'sge_track_reads')

    sge_quiver = ' -pe smp 24 -q bigmem '
    if config.has_option('Unzip', 'sge_quiver'):
        sge_quiver = config.get('Unzip', 'sge_quiver')

    smrt_bin = '/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/'
    if config.has_option('Unzip', 'smrt_bin'):
        smrt_bin = config.get('Unzip', 'smrt_bin')

    input_bam_fofn = 'input_bam.fofn'
    if config.has_option('Unzip', 'input_bam_fofn'):
        input_bam_fofn = config.get('Unzip', 'input_bam_fofn')
    if not os.path.isabs(input_bam_fofn):
        input_bam_fofn = os.path.join(config_absbasedir, input_bam_fofn)


    quiver_concurrent_jobs = 8
    if config.has_option('Unzip', 'quiver_concurrent_jobs'):
        quiver_concurrent_jobs = config.getint('Unzip', 'quiver_concurrent_jobs')

    config = {'job_type': job_type,
              'job_queue': job_queue,
              'sge_quiver': sge_quiver,
              'sge_track_reads': sge_track_reads,
              'input_bam_fofn': input_bam_fofn,
              'pwatcher_type': pwatcher_type,
              'smrt_bin': smrt_bin}
    LOG.info('config={}'.format(pprint.pformat(config)))

    #support.job_type = 'SGE' #tmp hack until we have a configuration parser


    wf = PypeProcWatcherWorkflow(
            max_jobs=quiver_concurrent_jobs,
            job_type=config['job_type'],
            job_queue=config.get('job_queue'),
            sge_option=config.get('sge_option'),
            watcher_type=config.get('pwatcher_type'),
            #watcher_directory=config.get('pwatcher_directory', 'mypwatcher'),
            use_tmpdir=config.get('use_tmpdir'),
    )

    abscwd = os.path.abspath('.')
    parameters = {
            'sge_option': config['sge_track_reads'],
    }
    input_bam_fofn_fn = config['input_bam_fofn']
    input_bam_fofn_plf = makePypeLocalFile(input_bam_fofn_fn)
    hasm_done_plf = makePypeLocalFile('./3-unzip/1-hasm/hasm_done') # by convention
    track_reads_h_done_plf = makePypeLocalFile('./4-quiver/reads/track_reads_h_done')
    make_track_reads_task = PypeTask(inputs = {
                                       'input_bam_fofn': input_bam_fofn_plf,
                                       'hasm_done': hasm_done_plf},
                                     outputs = {'job_done': track_reads_h_done_plf},
                                     parameters = parameters,
    )
    track_reads_task = make_track_reads_task(task_track_reads)

    wf.addTask(track_reads_task)

    scattered_quiver_plf = makePypeLocalFile('4-quiver/quiver_scatter/scattered.json')
    parameters = {
            'config': config,
    }
    make_task = PypeTask(
            inputs = {
                'p_ctg_fa': makePypeLocalFile('3-unzip/all_p_ctg.fa'),
                'h_ctg_fa': makePypeLocalFile('3-unzip/all_h_ctg.fa'),
                'track_reads_h_done': track_reads_h_done_plf,
            },
            outputs = {
                'scattered_quiver_json': scattered_quiver_plf,
            },
            parameters = parameters,
    )
    wf.addTask(make_task(task_scatter_quiver))
    wf.refreshTargets()

    p_ctg_out, h_ctg_out, job_done_plfs = create_quiver_jobs(wf, scattered_quiver_plf)

    gathered_p_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/p_ctg.txt')
    gathered_h_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/h_ctg.txt')
    gather_done_plf = makePypeLocalFile('4-quiver/cns_gather/job_done')
    mkdir('4-quiver/cns_gather')
    with open(fn(gathered_p_ctg_plf), 'w') as ofs:
        for cns_fasta_fn, cns_fastq_fn in sorted(p_ctg_out):
            ofs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))
    with open(fn(gathered_h_ctg_plf), 'w') as ofs:
        for cns_fasta_fn, cns_fastq_fn in sorted(h_ctg_out):
            ofs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))

    make_task = PypeTask(
            inputs = job_done_plfs,
            outputs = {
                'job_done': gather_done_plf,
            },
            parameters = {},
    )
    wf.addTask(make_task(task_gather_quiver))
    wf.refreshTargets()

    cns_p_ctg_fasta_plf = makePypeLocalFile('4-quiver/cns_output/cns_p_ctg.fasta')
    cns_p_ctg_fastq_plf = makePypeLocalFile('4-quiver/cns_output/cns_p_ctg.fastq')
    cns_h_ctg_fasta_plf = makePypeLocalFile('4-quiver/cns_output/cns_h_ctg.fasta')
    cns_h_ctg_fastq_plf = makePypeLocalFile('4-quiver/cns_output/cns_h_ctg.fastq')
    zcat_done_plf = makePypeLocalFile('4-quiver/cns_output/job_done')
    make_task = PypeTask(
            inputs = {
                'gathered_p_ctg': gathered_p_ctg_plf,
                'gathered_h_ctg': gathered_h_ctg_plf,
                'gather_done': gather_done_plf,
            },
            outputs = {
                'cns_p_ctg_fasta': cns_p_ctg_fasta_plf,
                'cns_p_ctg_fastq': cns_p_ctg_fastq_plf,
                'cns_h_ctg_fasta': cns_h_ctg_fasta_plf,
                'cns_h_ctg_fastq': cns_h_ctg_fastq_plf,
                'job_done': zcat_done_plf,
            },
    )
    wf.addTask(make_task(task_cns_zcat))

    wf.refreshTargets()
Example #51
0
                ovlp_data[ctg][1] += 1

        if len(ovlp_data) != 0:
            ovlp_v = ovlp_data.values()
            ovlp_v.sort()
            rank = 0
            for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
                rank += 1
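                # Row layout above: q_id_ (zero-padded to 9 digits), o_id, ctg,
                # count, rank (position in this sort order), score, in_ctg flag.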


phased_reads = makePypeLocalFile(os.path.join(asm_dir, "all_phased_reads"))


for las_key, las_file in all_raw_las_files.items():
    las_fn = fn(las_file)
    idx = las_fn.split("/")[-1] # well, we will use regex someday to parse to get the number
    idx = int(idx.split(".")[1]) 
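    # e.g. a merged-las basename like "raw_reads.23.las" yields idx == 23
    # (the exact prefix is an assumption; only the dot-separated position matters)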
    rawread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "rawread_to_contigs.%s" % idx))
    make_dump_rawread_to_ctg = PypeTask( inputs = { "las_file": las_file, 
                                                    "rawread_db": rawread_db, 
                                                    "read_to_contig_map": read_to_contig_map, 
                                                    "rawread_id_file": rawread_id_file,
                                                    "pread_id_file": pread_id_file,
                                                    "phased_reads" : phased_reads},
                                      outputs = { "rawread_to_contig_file": rawread_to_contig_file },
                                      TaskType = PypeThreadTaskBase,
                                      URL = "task://localhost/r_read_to_contigs.%s" % idx )
    dump_rawread_to_ctg_task = make_dump_rawread_to_ctg(dump_rawread_to_ctg)                           
    wf.addTask( dump_rawread_to_ctg_task )
Example #52
0
def dump_rawread_ids(self):
    rawread_db = fn(self.rawread_db)
    rawread_id_file = fn(self.rawread_id_file)
    os.system(
        "DBshow -n %s | tr -d '>' | LD_LIBRARY_PATH= awk '{print $1}' > %s" %
        (rawread_db, rawread_id_file))
Example #53
0
def run(wf, config,
        input_config_fn,
        input_fofn_plf,
        ):
    """
    Preconditions (for now):
    * fc_run_logger
    * run_support.logger
    """
    rawread_dir = os.path.abspath('./0-rawreads')
    pread_dir = os.path.abspath('./1-preads_ovl')
    falcon_asm_dir  = os.path.abspath('./2-asm-falcon')
    script_dir = os.path.abspath('./scripts')
    sge_log_dir = os.path.abspath('./sge_log')

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)

    exitOnFailure = config['stop_all_jobs_on_failure']  # only matters for parallel jobs
    concurrent_jobs = config['pa_concurrent_jobs']
    wf.max_jobs = concurrent_jobs

    rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, 'raw-fofn-abs', os.path.basename(config['input_fofn'])))
    make_fofn_abs_task = PypeTask(inputs = {'i_fofn': input_fofn_plf},
                                  outputs = {'o_fofn': rawread_fofn_plf},
                                  parameters = {},
    )
    fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_raw)

    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config['input_type'] == 'raw':
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile( os.path.join( rawread_dir, 'sleep_done') )
        rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, 'rdb_build_done') )
        run_jobs = makePypeLocalFile( os.path.join( rawread_dir, 'run_jobs.sh') )
        parameters = {'work_dir': rawread_dir,
                      'sge_option': config['sge_option_da'],
                      'config_fn': input_config_fn,
                      'config': config}

        length_cutoff_plf = makePypeLocalFile(os.path.join(rawread_dir, 'length_cutoff'))
        raw_reads_db_plf = makePypeLocalFile(os.path.join(rawread_dir, '%s.db' % 'raw_reads'))
        make_build_rdb_task = PypeTask(inputs = {'input_fofn': rawread_fofn_plf},
                                      outputs = {'rdb_build_done': rdb_build_done,
                                                 'raw_reads_db': raw_reads_db_plf,
                                                 'length_cutoff': length_cutoff_plf,
                                                 'run_jobs': run_jobs,
                                      },
                                      parameters = parameters,
        )
        build_rdb_task = make_build_rdb_task(pype_tasks.task_build_rdb)

        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        raw_reads_nblock = support.get_nblock(fn(raw_reads_db_plf))
        #### run daligner
        scattered_plf = os.path.join(rawread_dir, 'daligner-scatter', 'scattered.json')
        make_daligner_scatter = PypeTask(
                inputs = {
                    'run_jobs_fn': run_jobs,
                    'db_build_done': rdb_build_done,
                },
                outputs = {
                    'scatter_fn': scattered_plf,
                },
                parameters = {
                    'db_prefix': 'raw_reads',
                    'nblock': raw_reads_nblock,
                    'pread_aln': False,
                    'config': config,
                },
        )
        task = make_daligner_scatter(pype_tasks.task_daligner_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        daligner_tasks, daligner_out = create_daligner_tasks(rawread_dir, scattered_plf)

        wf.addTasks(daligner_tasks)
        r_gathered_las_plf = makePypeLocalFile(os.path.join(rawread_dir, 'raw-gather', 'gathered_las.txt'))

        parameters =  {
                'nblock': raw_reads_nblock,
        }
        make_daligner_gather = PypeTask(
                   inputs = daligner_out,
                   outputs =  {'gathered': r_gathered_las_plf},
                   parameters = parameters,
        )
        check_r_da_task = make_daligner_gather(pype_tasks.task_daligner_gather)
        wf.addTask(check_r_da_task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        # Merge .las files.
        scattered_plf = os.path.join(rawread_dir, 'merge-scatter', 'scattered.json')
        make_task = PypeTask(
                inputs = {
                    'run_jobs': run_jobs,
                    'gathered_las': r_gathered_las_plf,
                },
                outputs = {
                    'scattered': scattered_plf,
                },
                parameters = {
                    'db_prefix': 'raw_reads',
                    'config': config,
                },
        )
        task = make_task(pype_tasks.task_merge_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        merge_tasks, p_ids_merged_las = create_merge_tasks(rawread_dir, scattered_plf)
        wf.addTasks(merge_tasks)
        task, _, las_fopfn_plf = create_merge_gather_task(os.path.join(rawread_dir, 'merge-gather'), p_ids_merged_las)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        if config['target'] == 'overlapping':
            sys.exit(0)

        # Produce new FOFN of preads fasta, based on consensus of overlaps.
        scattered_plf = os.path.join(rawread_dir, 'cns-scatter', 'scattered.json')
        make_task = PypeTask(
                inputs = {
                    'gathered': las_fopfn_plf,
                    'db': raw_reads_db_plf,
                },
                outputs = {
                    'scattered': scattered_plf,
                },
                parameters = {
                    'db_prefix': 'raw_reads',
                    'config': config,
                },
        )
        task = make_task(pype_tasks.task_consensus_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        tasks, consensus_out = create_consensus_tasks(rawread_dir, scattered_plf)
        wf.addTasks(tasks)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        task, preads_fofn_plf = create_consensus_gather_task(os.path.join(rawread_dir, 'preads'), consensus_out)
        wf.addTask(task)

        rdir = os.path.join(rawread_dir, 'report')
        pre_assembly_report_plf = makePypeLocalFile(os.path.join(rdir, 'pre_assembly_stats.json'))
        parameters = dict(config)
        parameters['cwd'] = rdir
        make_task = PypeTask(
                inputs = {'length_cutoff_fn': length_cutoff_plf,
                          'raw_reads_db': raw_reads_db_plf,
                          'preads_fofn': preads_fofn_plf, },
                outputs = {'pre_assembly_report': pre_assembly_report_plf, },
                parameters = parameters,
        )
        task = make_task(pype_tasks.task_report_pre_assembly)
        wf.addTask(task)

        concurrent_jobs = config['cns_concurrent_jobs']
        wf.max_jobs = concurrent_jobs
        wf.refreshTargets(exitOnFailure=exitOnFailure)


    if config['target'] == 'pre-assembly':
        log.info('Quitting after stage-0 for "pre-assembly" target.')
        sys.exit(0)

    # build pread database
    if config['input_type'] == 'preads':
        preads_fofn_plf = makePypeLocalFile(os.path.join(pread_dir, 'preads-fofn-abs', os.path.basename(config['input_fofn'])))
        make_fofn_abs_task = PypeTask(inputs = {'i_fofn': rawread_fofn_plf},
                                     outputs = {'o_fofn': preads_fofn_plf},
                                     parameters = {},
        )
        fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile( os.path.join( pread_dir, 'pdb_build_done') )
    parameters = {'work_dir': pread_dir,
                  'sge_option': config['sge_option_pda'],
                  'config_fn': input_config_fn,
                  'config': config}

    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    preads_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db')) # Also .preads.*, of course.
    make_build_pdb_task  = PypeTask(inputs = {'preads_fofn': preads_fofn_plf },
                                    outputs = {'pdb_build_done': pdb_build_done,
                                               'preads_db': preads_db,
                                               'run_jobs': run_jobs,
                                    },
                                    parameters = parameters,
    )
    build_pdb_task = make_build_pdb_task(pype_tasks.task_build_pdb)

    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])


    preads_nblock = support.get_nblock(fn(preads_db))
    #### run daligner
    config['sge_option_da'] = config['sge_option_pda']

    scattered_plf = os.path.join(pread_dir, 'daligner-scatter', 'scattered.json')
    make_daligner_scatter = PypeTask(
            inputs = {
                'run_jobs_fn': run_jobs,
                'db_build_done': pdb_build_done,
            },
            outputs = {
                'scatter_fn': scattered_plf,
            },
            parameters = {
                'db_prefix': 'preads',
                'nblock': preads_nblock,
                'pread_aln': True,
                'config': config,
            },
    )
    task = make_daligner_scatter(pype_tasks.task_daligner_scatter)
    wf.addTask(task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    daligner_tasks, daligner_out = create_daligner_tasks(pread_dir, scattered_plf)
    wf.addTasks(daligner_tasks)

    p_gathered_las_plf = makePypeLocalFile(os.path.join(pread_dir, 'gathered-las', 'gathered-las.txt'))
    parameters =  {
            'nblock': preads_nblock,
    }
    make_daligner_gather = PypeTask(
                inputs = daligner_out,
                outputs =  {'gathered': p_gathered_las_plf},
                parameters = parameters,
    )
    check_p_da_task = make_daligner_gather(pype_tasks.task_daligner_gather)
    wf.addTask(check_p_da_task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    # Merge .las files.
    config['sge_option_la'] = config['sge_option_pla']
    scattered_plf = os.path.join(pread_dir, 'merge-scatter', 'scattered.json')
    make_task = PypeTask(
            inputs = {
                'run_jobs': run_jobs,
                'gathered_las': p_gathered_las_plf,
            },
            outputs = {
                'scattered': scattered_plf,
            },
            parameters = {
                'db_prefix': 'preads',
                'config': config,
            },
    )
    task = make_task(pype_tasks.task_merge_scatter)
    wf.addTask(task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    merge_tasks, p_ids_merged_las = create_merge_tasks(pread_dir, scattered_plf)
    wf.addTasks(merge_tasks)
    task, las_fofn_plf, las_fopfn_plf = create_merge_gather_task(os.path.join(pread_dir, 'merge-gather'), p_ids_merged_las)
    wf.addTask(task)

    concurrent_jobs = config['ovlp_concurrent_jobs']
    wf.max_jobs = concurrent_jobs

    wf.refreshTargets(exitOnFailure=exitOnFailure)


    db2falcon_dir = os.path.join(pread_dir, 'db2falcon')
    db2falcon_done = makePypeLocalFile(os.path.join(db2falcon_dir, 'db2falcon_done'))
    preads4falcon_plf = makePypeLocalFile(os.path.join(db2falcon_dir, 'preads4falcon.fasta'))
    make_run_db2falcon = PypeTask(
               inputs = {'las_fofn_plf': las_fofn_plf,
                         'preads_db': preads_db,
                        },
               outputs =  {'db2falcon_done': db2falcon_done,
                           'preads4falcon': preads4falcon_plf,
                          },
               parameters = {'wd': db2falcon_dir,
                             'config': config,
                             'sge_option': config['sge_option_fc'],
                            },
    )
    wf.addTask(make_run_db2falcon(pype_tasks.task_run_db2falcon))

    falcon_asm_done = makePypeLocalFile( os.path.join(falcon_asm_dir, 'falcon_asm_done'))
    make_run_falcon_asm = PypeTask(
               inputs = {'db2falcon_done': db2falcon_done, 'db_file': preads_db,
                         'preads4falcon': preads4falcon_plf,
                         'las_fofn': las_fofn_plf,
                        },
               outputs =  {'falcon_asm_done': falcon_asm_done},
               parameters = {'wd': falcon_asm_dir,
                             'config': config,
                             'pread_dir': pread_dir,
                             'sge_option': config['sge_option_fc'],
               },
    )
    wf.addTask(make_run_falcon_asm(pype_tasks.task_run_falcon_asm))
    wf.refreshTargets()

    return falcon_asm_done
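# A hypothetical driver for run() above, inferred from its signature alone;
# 'config' stands for the parsed configuration dict that the surrounding
# examples build by hand, and the filenames are placeholders.
wf = PypeProcWatcherWorkflow(max_jobs=8)
input_fofn_plf = makePypeLocalFile('input.fofn')
falcon_asm_done = run(wf, config,
                      input_config_fn='fc_run.cfg',
                      input_fofn_plf=input_fofn_plf)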
Example #54
0
def dump_rawread_ids(self):
    rawread_db = fn(self.rawread_db)
    rawread_id_file = fn(self.rawread_id_file)
    os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (rawread_db, rawread_id_file))
Example #55
0
def run(
    wf,
    config,
    rule_writer,
    config_fn,
    input_fofn_plf,
):
    """
    Preconditions (for now):
    * LOG
    * run_support.logger
    """
    parsed_config = io.deserialize(config_fn)
    if parsed_config != config:
        msg = 'Config from {!r} != passed config'.format(config_fn)
        raise Exception(msg)
    general_config = config['General']
    general_config_fn = os.path.join(os.path.dirname(config_fn),
                                     'General_config.json')
    io.serialize(general_config_fn, general_config)  # Some tasks use this.
    rawread_dir = '0-rawreads'
    pread_dir = '1-preads_ovl'
    falcon_asm_dir = '2-asm-falcon'

    for d in (rawread_dir, pread_dir, falcon_asm_dir):
        support.make_dirs(d)

    # only matter for parallel jobs
    job_defaults = config['job.defaults']
    exitOnFailure = bool(job_defaults.get('stop_all_jobs_on_failure', False))
    default_njobs = int(job_defaults.get('njobs', 7))
    wf.max_jobs = default_njobs

    assert general_config['input_type'] in (
        'raw', 'preads'), 'Invalid input_type=={!r}'.format(
            general_config['input_type'])

    # Store config as JSON, available to many tasks.

    if general_config['input_type'] == 'raw':
        parameters = {}

        # import sequences into daligner DB
        # calculate length_cutoff (if specified as -1)
        # split DB
        # run DBdust
        r_db_dust_fn = os.path.join(rawread_dir, 'build', 'raw_reads.db')
        length_cutoff_fn = os.path.join(rawread_dir, 'build', 'length_cutoff')
        wf.addTask(
            gen_task(
                script=pype_tasks.TASK_DB_BUILD_SCRIPT,
                inputs={
                    'config': general_config_fn,
                    'input_fofn': fn(input_fofn_plf),
                },
                outputs={
                    'length_cutoff': length_cutoff_fn,
                    'db': r_db_dust_fn,
                    # Also .raw_reads.*, of course. And dust track.
                },
                parameters=dict(),
                rule_writer=rule_writer,
                dist=Dist(NPROC=1),
            ))

        # run TANmask
        tan_uows_fn = os.path.join(rawread_dir, 'tan-split', 'tan-uows.json')
        tan_bash_template_fn = os.path.join(rawread_dir, 'tan-split',
                                            'bash_template.sh')
        wf.addTask(
            gen_task(
                script=pype_tasks.TASK_DB_TAN_SPLIT_SCRIPT,
                inputs={
                    'config': general_config_fn,
                    'db': r_db_dust_fn,
                },
                outputs={
                    'split': tan_uows_fn,
                    'bash_template': tan_bash_template_fn,
                },
                parameters={},
                rule_writer=rule_writer,
                dist=Dist(NPROC=1),
            ))

        gathered_fn = os.path.join(rawread_dir, 'tan-gathered',
                                   'gathered-done-files.json')
        gen_parallel_tasks(
            wf,
            rule_writer,
            tan_uows_fn,
            gathered_fn,
            run_dict=dict(
                bash_template_fn=tan_bash_template_fn,
                script='fubar-TODO',  # pype_tasks.TASK_DB_TAN_APPLY_SCRIPT, for snakemake stuff
                inputs={
                    'units_of_work':
                    '0-rawreads/tan-chunks/{tan0_id}/some-units-of-work.json',
                },
                outputs={
                    #'job_done': '0-rawreads/{dal0_id}/daligner.done',
                    'results':
                    '0-rawreads/tan-runs/{tan0_id}/some-done-files.json',
                },
                parameters={},
            ),
            dist=Dist(NPROC=4, MB=4000, job_dict=config['job.step.da']),
        )
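        # The split/apply/combine pattern above recurs for every stage below:
        # a *-split task emits units of work plus a bash template,
        # gen_parallel_tasks fans the units out as parallel jobs, and a
        # *-combine task folds the per-unit results back into a single file.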

        r_db_tan_fn = os.path.join(rawread_dir, 'tan-combine', 'raw_reads.db')
        wf.addTask(
            gen_task(
                script=pype_tasks.TASK_DB_TAN_COMBINE_SCRIPT,
                inputs={
                    'config': general_config_fn,
                    'db': r_db_dust_fn,
                    'gathered': gathered_fn,
                },
                outputs={
                    'new_db': r_db_tan_fn,
                },
                parameters={},
                rule_writer=rule_writer,
                dist=Dist(local=True),
            ))

        # run daligner
        wf.max_jobs = config['job.step.da'].get('njobs', default_njobs)
        #rawreads_db_fn = os.path.join(rawread_dir, 'raw_reads.db')
        daligner_all_units_fn = os.path.join(rawread_dir, 'daligner-split',
                                             'all-units-of-work.json')
        daligner_bash_template_fn = os.path.join(rawread_dir, 'daligner-split',
                                                 'daligner_bash_template.sh')
        params = dict(parameters)
        #params['db_prefix'] = 'raw_reads'
        #params['pread_aln'] = 0
        params['skip_checks'] = int(general_config.get('skip_checks', 0))
        params['wildcards'] = 'dal0_id'
        wf.addTask(
            gen_task(
                script=pype_tasks.TASK_DB_DALIGNER_SPLIT_SCRIPT,
                inputs={
                    'config': general_config_fn,
                    'db': r_db_tan_fn,
                    'length_cutoff': length_cutoff_fn,
                },
                outputs={
                    'split': daligner_all_units_fn,
                    'bash_template': daligner_bash_template_fn
                },
                parameters=params,
                rule_writer=rule_writer,
                dist=Dist(
                    local=True,
                    NPROC=4),  # really, NPROC=1, but we need to know the max
            ))

        gathered_fn = os.path.join(rawread_dir, 'daligner-gathered',
                                   'gathered-done-files.json')
        gen_parallel_tasks(
            wf,
            rule_writer,
            daligner_all_units_fn,
            gathered_fn,
            run_dict=dict(
                bash_template_fn=daligner_bash_template_fn,
                script=pype_tasks.TASK_DB_DALIGNER_APPLY_SCRIPT,  # for snakemake stuff
                inputs={
                    'units_of_work':
                    os.path.join(
                        rawread_dir,
                        'daligner-chunks/{dal0_id}/some-units-of-work.json'),
                },
                outputs={
                    'results':
                    os.path.join(
                        rawread_dir,
                        'daligner-runs/{dal0_id}/some-done-files.json'),
                },
                parameters={},
            ),
            dist=Dist(NPROC=4, MB=4000, job_dict=config['job.step.da']),
        )

        r_gathered_las_fn = os.path.join(rawread_dir, 'daligner-combine',
                                         'gathered-las.json')
        wf.addTask(
            gen_task(
                script=pype_tasks.TASK_DB_DALIGNER_COMBINE_SCRIPT,
                inputs={
                    'config': general_config_fn,
                    'db': r_db_tan_fn,
                    'gathered': gathered_fn,
                },
                outputs={
                    'las_paths': r_gathered_las_fn,
                },
                parameters={},
                rule_writer=rule_writer,
                #dist=Dist(NPROC=1, MB=4000, job_dict=config['job.step.da'])
                dist=Dist(local=True),
            ))

        # Merge .las files.
        wf.max_jobs = config['job.step.la'].get('njobs', default_njobs)
        las_merge_all_units_fn = os.path.join(rawread_dir, 'las-merge-split',
                                              'all-units-of-work.json')
        bash_template_fn = os.path.join(rawread_dir, 'las-merge-split',
                                        'las-merge-bash-template.sh')
        params = dict(parameters)
        params['db_prefix'] = 'raw_reads'
        params['wildcards'] = 'mer0_id'
        wf.addTask(
            gen_task(
                script=pype_tasks.TASK_DB_LAMERGE_SPLIT_SCRIPT,
                inputs={
                    'config': general_config_fn,
                    'las_paths': r_gathered_las_fn,
                },
                outputs={
                    'split': las_merge_all_units_fn,
                    'bash_template': bash_template_fn,
                },
                parameters=params,
                rule_writer=rule_writer,
                dist=Dist(local=True),
            ))

        gathered_fn = os.path.join(rawread_dir, 'las-merge-gathered',
                                   'gathered.json')
        gen_parallel_tasks(
            wf,
            rule_writer,
            las_merge_all_units_fn,
            gathered_fn,
            run_dict=dict(
                bash_template_fn=bash_template_fn,
                script=pype_tasks.TASK_DB_LAMERGE_APPLY_SCRIPT,  # for snakemake
                inputs={
                    #'las_paths': './0-rawreads/merge-scripts/{mer0_id}/las_paths.json',
                    #'merge_script': './0-rawreads/merge-scripts/{mer0_id}/merge-script.sh',
                    #'merged_las_json': './0-rawreads/merge-scripts/{mer0_id}/merged_las.json',
                    'units_of_work':
                    '0-rawreads/las-merge-chunks/{mer0_id}/some-units-of-work.json',
                },
                outputs={
                    #'merged_las': './0-rawreads/{mer0_id}/merged.las',
                    #'job_done': './0-rawreads/{mer0_id}/merge.done',
                    'results':
                    '0-rawreads/las-merge-runs/{mer0_id}/some-las-paths.json',
                },
                parameters={},
            ),
            dist=Dist(NPROC=1, job_dict=config['job.step.la']),
        )

        p_id2las_fn = os.path.join(rawread_dir, 'las-merge-combine',
                                   'p_id2las.json')
        las_fofn_fn = os.path.join(rawread_dir, 'las-merge-combine',
                                   'las_fofn.json')
        wf.addTask(
            gen_task(
                script=pype_tasks.TASK_DB_LAMERGE_COMBINE_SCRIPT,
                inputs={
                    'config': general_config_fn,
                    'gathered': gathered_fn,
                },
                outputs={
                    'block2las': p_id2las_fn,
                    'las_paths': las_fofn_fn,
                },
                parameters={},
                rule_writer=rule_writer,
                dist=Dist(local=True),
            ))

        if general_config['target'] == 'overlapping':
            sys.exit(0)

        # Produce new FOFN of preads fasta, based on consensus of overlaps.
        wf.max_jobs = config['job.step.cns'].get('njobs', default_njobs)

        split_fn = os.path.join(rawread_dir, 'cns-split', 'split.json')
        bash_template_fn = os.path.join(rawread_dir, 'cns-split',
                                        'consensus-bash-template.sh')
        params = dict(parameters)
        params['wildcards'] = 'cns0_id,cns0_id2'
        wf.addTask(
            gen_task(
                script=pype_tasks.TASK_CONSENSUS_SPLIT_SCRIPT,
                inputs={
                    'p_id2las': p_id2las_fn,
                    'raw_reads_db': r_db_tan_fn,
                    'length_cutoff': length_cutoff_fn,
                    'config': general_config_fn,
                },
                outputs={
                    'split': split_fn,
                    'bash_template': bash_template_fn,
                },
                parameters=params,
                rule_writer=rule_writer,
                dist=Dist(local=True),
            ))

        gathered_fn = os.path.join(rawread_dir, 'cns-gather', 'gathered.json')
        gen_parallel_tasks(
            wf,
            rule_writer,
            split_fn,
            gathered_fn,
            run_dict=dict(
                bash_template_fn=bash_template_fn,
                script=pype_tasks.TASK_CONSENSUS_TASK_SCRIPT,  # for snakemake only
                inputs={
                    #'las': '0-rawreads/cns-split/{cns0_id}/merged.{cns0_id2}.las',
                    #'db': r_db_tan_fn,
                    #'length_cutoff': length_cutoff_fn,
                    #'config': general_config_fn,
                    'units_of_work':
                    '0-rawreads/cns-chunks/{cns0_id}/some-units-of-work.json',
                },
                outputs={
                    #'fasta': '0-rawreads/consensus/{cns0_id}/consensus.{cns0_id2}.fasta',
                    'results':
                    '0-rawreads/cns-runs/{cns0_id}/some-done-files.json',
                },
                parameters={},
            ),
            dist=Dist(NPROC=6, job_dict=config['job.step.cns']),
        )
        preads_fofn_fn = os.path.join(rawread_dir, 'preads',
                                      'input_preads.fofn')
        wf.addTask(
            gen_task(
                script=pype_tasks.TASK_CONSENSUS_GATHER_SCRIPT,
                inputs={
                    'gathered': gathered_fn,
                },
                outputs={
                    'preads_fofn': preads_fofn_fn,
                },
                parameters=parameters,  #{},
                rule_writer=rule_writer,
                dist=Dist(local=True),
            ))

        rdir = os.path.join(rawread_dir, 'report')
        pre_assembly_report_fn = os.path.join(rdir, 'pre_assembly_stats.json')
        params = dict(parameters)
        params['length_cutoff_user'] = general_config['length_cutoff']
        params['genome_length'] = general_config['genome_size']  # note different name; historical
        wf.addTask(
            gen_task(
                script=pype_tasks.TASK_REPORT_PRE_ASSEMBLY_SCRIPT,
                inputs={
                    'length_cutoff': length_cutoff_fn,
                    'raw_reads_db': r_db_tan_fn,
                    'preads_fofn': preads_fofn_fn,
                    'config': general_config_fn,
                },
                outputs={
                    'pre_assembly_report': pre_assembly_report_fn,
                },
                parameters=params,
                rule_writer=rule_writer,
                dist=Dist(local=True),
            ))

    if general_config['target'] == 'pre-assembly':
        LOG.info('Quitting after stage-0 for "pre-assembly" target.')
        sys.exit(0)

    # build pread database
    if general_config['input_type'] == 'preads':
        """
        preads_fofn_plf = makePypeLocalFile(os.path.join(
            pread_dir, 'preads-fofn-abs', os.path.basename(general_config['input_fofn'])))
        make_fofn_abs_task = PypeTask(inputs={'i_fofn': input_fofn_plf},
                                      outputs={'o_fofn': preads_fofn_plf},
                                      parameters={},
                                      )
        fofn_abs_task = make_fofn_abs_task(
            pype_tasks.task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])
        """
        raise Exception('TODO')

    pdb_build_done = os.path.join(pread_dir, 'pdb_build_done')
    run_jobs_fn = os.path.join(pread_dir, 'run_jobs.sh')
    preads_db_fn = os.path.join(pread_dir, 'build', 'preads.db')
    length_cutoff_pr_fn = os.path.join(pread_dir, 'build', 'length_cutoff')

    wf.addTask(
        gen_task(
            script=pype_tasks.TASK_DB_BUILD_SCRIPT,
            inputs={
                'config': general_config_fn,
                'input_fofn': preads_fofn_fn,
            },
            outputs={
                'length_cutoff': length_cutoff_pr_fn,
                'db': preads_db_fn,
                # Also .preads.*, of course.
            },
            parameters=dict(),
            rule_writer=rule_writer,
            dist=Dist(NPROC=1),
        ))

    # run daligner
    wf.max_jobs = config['job.step.pda'].get('njobs', default_njobs)
    daligner_all_units_fn = os.path.join(pread_dir, 'daligner-split',
                                         'all-units-of-work.json')
    daligner_bash_template_fn = os.path.join(pread_dir, 'daligner-split',
                                             'daligner_bash_template.sh')
    params = dict(parameters)
    params['skip_checks'] = int(general_config.get('skip_checks', 0))
    params['wildcards'] = 'dal1_id'
    wf.addTask(
        gen_task(
            script=pype_tasks.TASK_DB_DALIGNER_SPLIT_SCRIPT,
            inputs={
                'config': general_config_fn,
                'db': preads_db_fn,  #not tan, yet
                'length_cutoff': length_cutoff_pr_fn,
            },
            outputs={
                'split': daligner_all_units_fn,
                'bash_template': daligner_bash_template_fn
            },
            parameters=params,
            rule_writer=rule_writer,
            dist=Dist(local=True,
                      NPROC=4),  # really, NPROC=1, but we need to know the max
        ))

    gathered_fn = os.path.join(pread_dir, 'daligner-gathered',
                               'gathered-done-files.json')
    gen_parallel_tasks(
        wf,
        rule_writer,
        daligner_all_units_fn,
        gathered_fn,
        run_dict=dict(
            bash_template_fn=daligner_bash_template_fn,
            script=pype_tasks.TASK_DB_DALIGNER_APPLY_SCRIPT,  # for snakemake stuff
            inputs={
                'units_of_work':
                os.path.join(
                    pread_dir,
                    'daligner-chunks/{dal1_id}/some-units-of-work.json'),
            },
            outputs={
                'results':
                os.path.join(pread_dir,
                             'daligner-runs/{dal1_id}/some-done-files.json'),
            },
            parameters={},
        ),
        dist=Dist(NPROC=4, MB=4000, job_dict=config['job.step.pda']),
    )

    gathered_las_fn = os.path.join(pread_dir, 'daligner-combine',
                                   'gathered-las.json')
    wf.addTask(
        gen_task(
            script=pype_tasks.TASK_DB_DALIGNER_COMBINE_SCRIPT,
            inputs={
                'config': general_config_fn,
                'db': preads_db_fn,  #r_db_tan_fn,
                'gathered': gathered_fn,
            },
            outputs={
                'las_paths': gathered_las_fn,
            },
            parameters={},
            rule_writer=rule_writer,
            #dist=Dist(NPROC=1, MB=4000, job_dict=config['job.step.pda'])
            dist=Dist(local=True),
        ))

    # Merge .las files.
    wf.max_jobs = config['job.step.pla'].get('njobs', default_njobs)
    las_merge_all_units_fn = os.path.join(pread_dir, 'las-merge-split',
                                          'all-units-of-work.json')
    bash_template_fn = os.path.join(pread_dir, 'las-merge-split',
                                    'las-merge-bash-template.sh')
    params = dict(parameters)
    params['db_prefix'] = 'preads'
    params['wildcards'] = 'mer1_id'
    wf.addTask(
        gen_task(
            script=pype_tasks.TASK_DB_LAMERGE_SPLIT_SCRIPT,
            inputs={
                'config': general_config_fn,
                'las_paths': gathered_las_fn,
            },
            outputs={
                'split': las_merge_all_units_fn,
                'bash_template': bash_template_fn,
            },
            parameters=params,
            rule_writer=rule_writer,
            dist=Dist(local=True),
        ))

    gathered_fn = os.path.join(pread_dir, 'las-merge-gathered',
                               'gathered.json')
    gen_parallel_tasks(
        wf,
        rule_writer,
        las_merge_all_units_fn,
        gathered_fn,
        run_dict=dict(
            bash_template_fn=bash_template_fn,
            script=pype_tasks.TASK_DB_LAMERGE_APPLY_SCRIPT,  # for snakemake
            inputs={
                'units_of_work':
                os.path.join(
                    pread_dir,
                    'las-merge-chunks/{mer1_id}/some-units-of-work.json'),
            },
            outputs={
                'results':
                os.path.join(pread_dir,
                             'las-merge-runs/{mer1_id}/some-las-paths.json'),
            },
            parameters={},
        ),
        dist=Dist(NPROC=1, job_dict=config['job.step.pla']),
    )

    p_id2las_fn = os.path.join(pread_dir, 'las-merge-combine',
                               'block2las.json')
    las_fofn_fn = os.path.join(pread_dir, 'las-merge-combine', 'las_fofn.json')
    wf.addTask(
        gen_task(
            script=pype_tasks.TASK_DB_LAMERGE_COMBINE_SCRIPT,
            inputs={
                'config': general_config_fn,
                'gathered': gathered_fn,
            },
            outputs={
                'block2las': p_id2las_fn,
                'las_paths': las_fofn_fn,
            },
            parameters={},
            rule_writer=rule_writer,
            dist=Dist(local=True),
        ))

    wf.max_jobs = config['job.step.asm'].get('njobs', default_njobs)
    db2falcon_dir = os.path.join(pread_dir, 'db2falcon')
    db2falcon_done_fn = os.path.join(db2falcon_dir, 'db2falcon_done')
    preads4falcon_fn = os.path.join(db2falcon_dir, 'preads4falcon.fasta')
    wf.addTask(
        gen_task(
            script=pype_tasks.TASK_RUN_DB_TO_FALCON_SCRIPT,
            inputs={
                'p_id2las': p_id2las_fn,
                'preads_db': preads_db_fn,
            },
            outputs={
                'job_done': db2falcon_done_fn,
                'preads4falcon': preads4falcon_fn,
            },
            parameters={},
            rule_writer=rule_writer,
            dist=Dist(NPROC=4, job_dict=config['job.step.asm']),
        ))

    falcon_asm_done_fn = os.path.join(falcon_asm_dir, 'falcon_asm_done')
    for key in ('overlap_filtering_setting', 'length_cutoff_pr',
                'fc_ovlp_to_graph_option'):
        parameters[key] = general_config[key]
    wf.addTask(
        gen_task(
            script=pype_tasks.TASK_RUN_FALCON_ASM_SCRIPT,
            inputs={
                'db2falcon_done': db2falcon_done_fn,
                'db_file': preads_db_fn,
                'preads4falcon_fasta': preads4falcon_fn,
                'las_fofn': las_fofn_fn,
                'config': general_config_fn,
            },
            outputs={'falcon_asm_done': falcon_asm_done_fn},
            parameters=parameters,
            rule_writer=rule_writer,
            dist=Dist(NPROC=4, job_dict=config['job.step.asm']),
        ))
    wf.refreshTargets()

    with io.cd('0-rawreads'):
        # for backwards-compatibility
        io.symlink('las-merge-combine', 'las-gather')
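# The Dist objects above are only ever constructed with NPROC, MB, local, and
# job_dict; a minimal stand-in consistent with those call sites (the real
# class ships with the workflow engine) might look like:
class Dist(object):
    def __init__(self, NPROC=1, MB=4000, local=False, job_dict=None):
        self.NPROC = NPROC              # processors requested per job
        self.MB = MB                    # memory per job, in megabytes
        self.local = local              # run on the submit host, not the cluster
        self.job_dict = job_dict or {}  # per-step scheduler overrides, e.g. config['job.step.da']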