Example no. 1
0
def repeatmasker(args):
    """Write a PBS job script that runs RepeatMasker on a genome.

    Creates <genome_dir>/12_repeatmasker, builds the RepeatMasker command
    plus an output-parsing step, and writes them as one PBS job to
    <genome_dir>/13.rm.pbs.  Exits with status 1 when args.species is not
    a supported maize/rice accession.
    """
    dirg, fg = get_genomedir(args.species)
    dirw = op.join(dirg, "12_repeatmasker")
    if not op.isdir(dirw): os.makedirs(dirw)
    os.chdir(dirw)

    # Map the accession name to RepeatMasker's '-species' argument.
    species = None
    if args.species in ['Zmays', 'B73', 'PH207', 'W22', 'Mo17', 'PHB47']:
        species = 'maize'
    elif args.species == 'Osativa':
        species = 'rice'
    else:
        logging.error("%s not supported" % args.species)
        sys.exit(1)

    cmds = []
    cmds.append("cd %s" % dirw)
    # (A stray trailing comma after this append() used to create a
    # throwaway 1-tuple; removed.)
    cmds.append("RepeatMasker -pa %d -species %s -dir %s %s" %
                (args.p, species, dirw, fg))
    cmds.append("parse.rm.pl -i 11_genome.fas.out -o 12.repeatmasker.tsv")

    pbsjob = PbsJob(queue='ram256g',
                    ppn=24,
                    walltime="10:00:00",
                    cmds="\n".join(cmds))
    fjob = op.join(dirg, "13.rm.pbs")
    pbsjob.write(fjob)
    logging.debug("Job script '%s' has been created" % fjob)
Example no. 2
0
def index(cfg, args):
    """Build PBS jobs that create and index per-genotype pseudo-references.

    Reads the 'index' section of *cfg*; for every genotype flagged as
    'Inbred' in the input list it queues a bcftools-consensus pseudo-genome
    build (job 1) and bowtie/hisat index builds (jobs 2 and 3), then chains
    the three PBS jobs into one submission script.
    """
    c = AttrDict(cfg['index'])
    c = check_cfg_index(c)
    if args.check:
        # Config validation only; nothing is written.
        return 0
    os.chdir(c.dirw)

    # Per-job shell preambles: every job starts by cd'ing into the work dir.
    jcmds = [[
        "cd %s" % c.dirw
        ],[
        "cd %s" % c.dirw
        ],[
        "cd %s" % c.dirw
    ]]
    # Batch configs: jobs 1 and 3 run serially, job 2 through GNU parallel.
    bcfgs = [
        [dict(opt = 'bash')],
        [dict(opt = 'parallel', thread = c.pbs_ppns[1])],
        [dict(opt = 'bash')]
    ]

    assert c.njob == len(bcfgs) == len(jcmds), "not %d jobs" % c.njob
    jobs = []
    for i in range(c.njob):
        prefix = "%s.%d" % (c.job_prefix, i+1)
        # Per-job PBS resources come from parallel per-index config lists.
        jcfg = {
            'queue': c.pbs_queues[i],
            'ppn': c.pbs_ppns[i],
            'walltime': c.pbs_walltimes[i],
            'mem': c.pbs_mems[i],
            'email': c.pbs_email,
        }
        job = PbsJob.from_cfg(jcfg = jcfg, jcmds = jcmds[i], bcfgs = bcfgs[i],
                prefix = prefix, njob = len(bcfgs[i]),
                bash = c.bash, parallel = c.parallel)
        jobs.append(job)

    # Collect the unique genotypes marked 'Inbred' in the sample table.
    t = Table.read(c.ilist, format = 'ascii.tab')
    nrow = len(t)
    gts = [t['genotype'][x] for x in range(nrow) if t['type'][x] == 'Inbred']
    gts = set(gts)
    logging.debug("creating pseudo-refs for %d genomes" % len(gts))
    print(" ".join(gts))  # NOTE(review): set iteration order is arbitrary
    for gt in gts:
        diro = "%s/%s" % (c.outdirs[0], gt)
        mkdir(diro)
        # Job 1: build the pseudo-reference fasta with bcftools consensus.
        jobs[0].subjobs[0].add_cmd("%s consensus -f %s %s -s %s \
                -c %s/25.chain -o %s/11_genome.fas" % \
                (c.bcftools, c.genome, c.vcf, gt, diro, diro))
        #jobs[1].subjobs[0].add_cmd("genome fasta %s" % diro)
        #jobs[0].subjobs[0].add_cmd("genome blat %s" % diro)
        #jobs[0].subjobs[0].add_cmd("genome bwa %s" % diro)
        # Jobs 2 and 3: build bowtie and hisat indices for the new genome.
        jobs[1].subjobs[0].add_cmd("genome bowtie %s" % diro)
        jobs[2].subjobs[0].add_cmd("genome hisat %s" % diro)

    for job in jobs:
        job.write()
    # Chain the job scripts so each stage waits on the previous one.
    fj = "%s.sh" % c.job_prefix
    create_job_chain([job.fname for job in jobs], fj)
    logging.debug("job chain with %s jobs was created: %s" % (c.njob, fj))
Example no. 3
0
def repeatmasker(args):
    """Write a PBS job script that runs RepeatMasker on a genome.

    Creates <genome_dir>/12_repeatmasker, builds the RepeatMasker command
    plus an output-parsing step, and writes them as one PBS job to
    <genome_dir>/13.rm.pbs.  Exits with status 1 when args.species is not
    a supported maize/rice accession.
    """
    dirg, fg = get_genomedir(args.species)
    dirw = op.join(dirg, "12_repeatmasker")
    if not op.isdir(dirw): os.makedirs(dirw)
    os.chdir(dirw)

    # Map the accession name to RepeatMasker's '-species' argument.
    species = None
    if args.species in ['Zmays', 'B73', 'PH207', 'W22', 'Mo17', 'PHB47']:
        species = 'maize'
    elif args.species == 'Osativa':
        species = 'rice'
    else:
        logging.error("%s not supported" % args.species)
        sys.exit(1)

    cmds = []
    cmds.append("cd %s" % dirw)
    # (A stray trailing comma after this append() used to create a
    # throwaway 1-tuple; removed.)
    cmds.append("RepeatMasker -pa %d -species %s -dir %s %s" % (args.p, species, dirw, fg))
    cmds.append("parse.rm.pl -i 11_genome.fas.out -o 12.repeatmasker.tsv")

    pbsjob = PbsJob(queue = 'ram256g', ppn = 24, walltime = "10:00:00", cmds = "\n".join(cmds))
    fjob = op.join(dirg, "13.rm.pbs")
    pbsjob.write(fjob)
    logging.debug("Job script '%s' has been created" % fjob)
Example no. 4
0
def hisat(cfg, check):
    """Write shell scripts and three chained PBS jobs for a hisat pipeline.

    Stages (one PBS job each, written as <jobpre>.{a,b,c}.pbs):
      a) align every sample in *ilist* with hisat,
      b) sort the SAM files to BAM and index them with samtools,
      c) collect picard alignment/insert-size metrics.
    Per-sample commands are appended to <jobpre>.{1.hisat,2.bam,2.bamidx,
    3.stat}.sh only when their output does not exist yet, so re-running
    this generator resumes an interrupted pipeline.

    Args:
        cfg: configparser-like mapping with a 'hisat' section.
        check: if truthy, only verify existing outputs and exit(0).
    """
    cfg = cfg['hisat']
    dirw, ilist, olist, jobpre, diro1, diro2 = \
            cfg['dirw'], cfg['ilist'], cfg['olist'], cfg['job_prefix'], \
            cfg['outdir1'], cfg['outdir2']
    paired = cfg.getboolean('paired')
    temp_dir = cfg['temp_dir']
    ref_gatk = cfg['ref_gatk']
    gatk = cfg['gatk']
    db_hisat, hisat, samtools, parallel = \
            cfg['db_hisat'], cfg['hisat'], cfg['samtools'], cfg['parallel']
    pbs_template, pbs_queue, pbs_walltime, pbs_ppn, pbs_email = \
            cfg['pbs_template'], cfg['pbs_queue'], cfg['pbs_walltime'], \
            cfg['pbs_ppn'], cfg['pbs_email']
    if check:
        hisat_check(dirw, ilist, olist, diro1, diro2, paired)
        sys.exit(0)

    if not op.isdir(dirw): os.makedirs(dirw)
    os.chdir(dirw)
    assert op.isfile(ilist), "%s not exist" % ilist
    ary = np.genfromtxt(ilist, names = True, dtype = object, delimiter = "\t")
    fo1, fo2, fo2b, fo3 = ["%s.%s.sh" % (jobpre, i) for i in \
            ['1.hisat','2.bam','2.bamidx','3.stat']]
    fho1, fho2, fho2b, fho3 = [open(x, "w") for x in [fo1, fo2, fo2b, fo3]]
    for diro in [diro1, diro2]:
        if not op.isdir(diro):
            os.makedirs(diro)
    # PBS resources come in as comma-separated lists, one entry per stage.
    pbs_queues = pbs_queue.split(",")
    pbs_ppns = pbs_ppn.split(",")
    pbs_walltimes = pbs_walltime.split(",")
    for row in ary:
        # np.genfromtxt(dtype=object) yields bytes fields; decode to str.
        row = [str(x, 'utf-8') for x in list(row)]
        sid = row[0]
        pre1 = "%s/%s" % (diro1, sid)
        fsam = "%s.sam" % pre1
        fbam = "%s.bam" % pre1
        if paired:
            f1r, f2r, rc, f1p, f1u, f2p, f2u, rrc, rc1, rc2 = row[1:11]
            if not op.isfile(fsam):
                fho1.write("%s -p %s -x %s -q -1 %s -2 %s -U %s,%s \
                        --rg-id %s --rg SM:%s -S %s.sam\n" % \
                        (hisat, pbs_ppns[0], db_hisat, f1p, f2p, f1u, f2u, sid, sid, pre1))
        else:
            fr, rc, ft, rrc = row[1:5]
            if not op.isfile(fsam):
                fho1.write("%s -p %s -x %s -q -U %s \
                        --rg-id %s --rg SM:%s -S %s.sam\n" % \
                        (hisat, pbs_ppns[0], db_hisat, ft, sid, sid, pre1))
        if not op.isfile(fbam):
            fho2.write("%s sort -m 2500M -O bam -o %s.bam %s.sam\n" % (samtools, pre1, pre1))
            fho2b.write("%s index %s.bam\n" % (samtools, pre1))
        pre2 = "%s/%s" % (diro2, sid)
        fho3.write("$PTOOL/picard.jar CollectAlignmentSummaryMetrics \
                R=%s I=%s.bam O=%s.sum.txt\n" % \
                (ref_gatk, pre1, pre2))
        fho3.write("$PTOOL/picard.jar CollectInsertSizeMetrics \
                INPUT=%s.bam OUTPUT=%s.ins.txt HISTOGRAM_FILE=%s.hist.pdf\n" \
                % (pre1, pre2, pre2))
    # Close (and thereby flush) the stage scripts before the jobs run them;
    # the original leaked these handles and relied on interpreter exit.
    for fh in [fho1, fho2, fho2b, fho3]:
        fh.close()

    cmds = [[
        "cd %s" % dirw,
        "bash %s" % fo1
    ], [
        "cd %s" % dirw,
        "%s -j %s < %s" % (parallel, pbs_ppns[1], fo2),
        "%s -j %s < %s" % (parallel, pbs_ppns[1], fo2b)
    ], [
        "module load picard/2.3.0",
        "export _JAVA_OPTIONS='-Djava.io.tmpdir=%s'" % temp_dir,
        "cd %s" % dirw,
        "%s -j %s < %s" % (parallel, pbs_ppns[2], fo3)
    ]]
    njob = len(cmds)
    # Fail fast if the comma-separated PBS resource lists don't match the
    # number of stages (queues were previously unchecked).
    assert len(pbs_queues) == njob, "not %d jobs" % njob
    assert len(pbs_walltimes) == njob, "not %d jobs" % njob
    assert len(pbs_ppns) == njob, "not %d jobs" % njob

    fjobs = ["%s.%s.pbs" % (jobpre, chr(97+i)) for i in range(njob)]
    for i in range(njob):
        pbsjob = PbsJob(queue = pbs_queues[i],
                ppn = pbs_ppns[i],
                walltime = pbs_walltimes[i],
                email = pbs_email,
                cmds = "\n".join(cmds[i])
        )
        pbsjob.write(fjobs[i])

    logging.debug("%s job scripts were created: %s" % (njob, ", ".join(fjobs)))
    logging.debug("qsub %s" % fjobs[0])
    logging.debug("qsub -W depend=afterok:??? %s" % fjobs[1])
Example no. 5
0
def fq_trim(cfg, args):
    """Build a single PBS job that QCs and adapter-trims FASTQ files.

    One job with three GNU-parallel sub-batches per sample: FastQC on the
    raw reads, Trimmomatic (PE or SE depending on the config), and FastQC
    again on the trimmed output.  Finishes by writing a one-job chain
    script.
    """
    c = AttrDict(cfg['fastq_trim'])
    c = check_cfg_fqtrim(c)
    if args.check:
        # Validation-only mode; nothing is written.
        fq_trim_check(c)
        return 0
    os.chdir(c.dirw)

    # Shell preamble shared by all sub-batches of the single job.
    jcmds = [[
        "export _JAVA_OPTIONS='-Djava.io.tmpdir=%s'" % c.temp_dir,
        "cd %s" % c.dirw
    ]]
    # Three parallel sub-batches: raw QC, trimming, trimmed QC.
    bcfgs = [
        [dict(opt = 'parallel', thread = c.pbs_ppns[0]),
        dict(opt = 'parallel', thread = c.pbs_ppns[0]),
        dict(opt = 'parallel', thread = c.pbs_ppns[0])]
    ]

    assert c.njob == len(bcfgs) == len(jcmds), "not %d jobs" % c.njob
    jobs = []
    for i in range(c.njob):
        prefix = "%s.%d" % (c.job_prefix, i+1)
        jcfg = {
            'queue': c.pbs_queues[i],
            'ppn': c.pbs_ppns[i],
            'walltime': c.pbs_walltimes[i],
            'email': c.pbs_email,
        }
        job = PbsJob.from_cfg(jcfg = jcfg, jcmds = jcmds[i], bcfgs = bcfgs[i],
                prefix = prefix, njob = len(bcfgs[i]),
                bash = c.bash, parallel = c.parallel)
        jobs.append(job)

    t = Table.read(c.ilist, format = 'ascii.tab')
    nrow = len(t)
    if c.paired:
        for i in range(nrow):
            sid, f1, f2 = t['sid'][i], t['Readfile1'][i], t['Readfile2'][i]
            assert op.isfile(f1), "%s not there" % f1
            assert op.isfile(f2), "%s not there" % f2
            # Sub-batch 0: FastQC on the raw read pair.
            jobs[0].subjobs[0].add_cmd("%s -o %s --extract -f fastq %s %s" % \
                    (c.fastqc, c.outdirs[0], f1, f2))
            # Trimmomatic PE emits paired + unpaired files for each mate.
            f11, f12, f21, f22 = ["%s/%s_%s.fq.gz" % (c.outdirs[1], sid, x) \
                    for x in ['1.PE', '1.SE', '2.PE', '2.SE']]
            # Sub-batch 1: paired-end adapter/quality trimming.
            jobs[0].subjobs[1].add_cmd("java -Xmx2500M -jar %s PE -threads 4 \
                    %s %s %s %s %s %s ILLUMINACLIP:%s:2:30:10:8:no \
                    LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:35" % \
                    (c.trimmomatic, f1, f2, f11, f12, f21, f22, c.adapter))
            # Sub-batch 2: FastQC on the four trimmed outputs.
            jobs[0].subjobs[2].add_cmd("%s -o %s --extract -f fastq %s %s %s %s" % \
                    (c.fastqc, c.outdirs[2], f11, f12, f21, f22))
    else:
        for i in range(nrow):
            sid, f1 = t['sid'][i], t['Readfile'][i]
            assert op.isfile(f1), "%s not there" % f1
            # Sub-batch 0: FastQC on the raw reads.
            jobs[0].subjobs[0].add_cmd("%s -o %s --extract -f fastq %s" % \
                    (c.fastqc, c.outdirs[0], f1))
            fo = "%s/%s.fq.gz" % (c.outdirs[1], sid)
            # Sub-batch 1: single-end adapter/quality trimming.
            jobs[0].subjobs[1].add_cmd("java -Xmx2500M -jar %s SE -threads 4 \
                    %s %s ILLUMINACLIP:%s:2:30:10:8:no \
                    LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:35" % \
                    (c.trimmomatic, f1, fo, c.adapter))
            # Sub-batch 2: FastQC on the trimmed output.
            jobs[0].subjobs[2].add_cmd("%s -o %s --extract -f fastq %s" % \
                    (c.fastqc, c.outdirs[2], fo))

    for job in jobs:
        job.write()
    fj = "%s.sh" % c.job_prefix
    create_job_chain([job.fname for job in jobs], fj)
    logging.debug("job chain with %s jobs was created: %s" % (c.njob, fj))
Example no. 6
0
def hisat(cfg, check):
    """Write shell scripts and three chained PBS jobs for a hisat pipeline.

    Stages (one PBS job each, written as <jobpre>.{a,b,c}.pbs):
      a) align every sample in *ilist* with hisat,
      b) sort the SAM files to BAM and index them with samtools,
      c) collect picard alignment/insert-size metrics.
    Per-sample commands are added only when their output does not yet
    exist, so re-running this generator resumes an interrupted pipeline.

    Args:
        cfg: configparser-like mapping with a 'hisat' section.
        check: if truthy, only verify existing outputs and exit(0).
    """
    cfg = cfg['hisat']
    dirw, ilist, olist, jobpre, diro1, diro2 = \
            cfg['dirw'], cfg['ilist'], cfg['olist'], cfg['job_prefix'], \
            cfg['outdir1'], cfg['outdir2']
    paired = cfg.getboolean('paired')
    temp_dir = cfg['temp_dir']
    ref_gatk = cfg['ref_gatk']
    gatk = cfg['gatk']
    db_hisat, hisat, samtools, parallel = \
            cfg['db_hisat'], cfg['hisat'], cfg['samtools'], cfg['parallel']
    pbs_template, pbs_queue, pbs_walltime, pbs_ppn, pbs_email = \
            cfg['pbs_template'], cfg['pbs_queue'], cfg['pbs_walltime'], \
            cfg['pbs_ppn'], cfg['pbs_email']
    if check:
        hisat_check(dirw, ilist, olist, diro1, diro2, paired)
        sys.exit(0)

    if not op.isdir(dirw): os.makedirs(dirw)
    os.chdir(dirw)
    assert op.isfile(ilist), "%s not exist" % ilist
    ary = np.genfromtxt(ilist, names=True, dtype=object, delimiter="\t")
    fo1, fo2, fo2b, fo3 = ["%s.%s.sh" % (jobpre, i) for i in \
            ['1.hisat','2.bam','2.bamidx','3.stat']]
    fho1, fho2, fho2b, fho3 = [open(x, "w") for x in [fo1, fo2, fo2b, fo3]]
    for diro in [diro1, diro2]:
        if not op.isdir(diro):
            os.makedirs(diro)
    # PBS resources come in as comma-separated lists, one entry per stage.
    pbs_queues = pbs_queue.split(",")
    pbs_ppns = pbs_ppn.split(",")
    pbs_walltimes = pbs_walltime.split(",")
    for row in ary:
        # np.genfromtxt(dtype=object) yields bytes fields; decode to str.
        row = [str(x, 'utf-8') for x in list(row)]
        sid = row[0]
        pre1 = "%s/%s" % (diro1, sid)
        fsam = "%s.sam" % pre1
        fbam = "%s.bam" % pre1
        if paired:
            f1r, f2r, rc, f1p, f1u, f2p, f2u, rrc, rc1, rc2 = row[1:11]
            if not op.isfile(fsam):
                fho1.write("%s -p %s -x %s -q -1 %s -2 %s -U %s,%s \
                        --rg-id %s --rg SM:%s -S %s.sam\n" % \
                        (hisat, pbs_ppns[0], db_hisat, f1p, f2p, f1u, f2u, sid, sid, pre1))
        else:
            fr, rc, ft, rrc = row[1:5]
            if not op.isfile(fsam):
                fho1.write("%s -p %s -x %s -q -U %s \
                        --rg-id %s --rg SM:%s -S %s.sam\n" % \
                        (hisat, pbs_ppns[0], db_hisat, ft, sid, sid, pre1))
        if not op.isfile(fbam):
            fho2.write("%s sort -m 2500M -O bam -o %s.bam %s.sam\n" %
                       (samtools, pre1, pre1))
            fho2b.write("%s index %s.bam\n" % (samtools, pre1))
        pre2 = "%s/%s" % (diro2, sid)
        fho3.write("$PTOOL/picard.jar CollectAlignmentSummaryMetrics \
                R=%s I=%s.bam O=%s.sum.txt\n" % \
                (ref_gatk, pre1, pre2))
        fho3.write("$PTOOL/picard.jar CollectInsertSizeMetrics \
                INPUT=%s.bam OUTPUT=%s.ins.txt HISTOGRAM_FILE=%s.hist.pdf\n" \
                % (pre1, pre2, pre2))
    # Close (and thereby flush) the stage scripts before the jobs run them;
    # the original leaked these handles and relied on interpreter exit.
    for fh in [fho1, fho2, fho2b, fho3]:
        fh.close()

    cmds = [["cd %s" % dirw, "bash %s" % fo1],
            [
                "cd %s" % dirw,
                "%s -j %s < %s" % (parallel, pbs_ppns[1], fo2),
                "%s -j %s < %s" % (parallel, pbs_ppns[1], fo2b)
            ],
            [
                "module load picard/2.3.0",
                "export _JAVA_OPTIONS='-Djava.io.tmpdir=%s'" % temp_dir,
                "cd %s" % dirw,
                "%s -j %s < %s" % (parallel, pbs_ppns[2], fo3)
            ]]
    njob = len(cmds)
    # Fail fast if the comma-separated PBS resource lists don't match the
    # number of stages (queues were previously unchecked).
    assert len(pbs_queues) == njob, "not %d jobs" % njob
    assert len(pbs_walltimes) == njob, "not %d jobs" % njob
    assert len(pbs_ppns) == njob, "not %d jobs" % njob

    fjobs = ["%s.%s.pbs" % (jobpre, chr(97 + i)) for i in range(njob)]
    for i in range(njob):
        pbsjob = PbsJob(queue=pbs_queues[i],
                        ppn=pbs_ppns[i],
                        walltime=pbs_walltimes[i],
                        email=pbs_email,
                        cmds="\n".join(cmds[i]))
        pbsjob.write(fjobs[i])

    logging.debug("%s job scripts were created: %s" % (njob, ", ".join(fjobs)))
    logging.debug("qsub %s" % fjobs[0])
    logging.debug("qsub -W depend=afterok:??? %s" % fjobs[1])
Example no. 7
0
def mapping(cfg, args):
    """Create three chained PBS jobs that map trimmed reads to a reference.

    Job 1 aligns each sample with the configured mapper (bwa / hisat2 /
    bowtie2), job 2 converts SAM to a raw BAM, job 3 coordinate-sorts the
    BAM (sambamba) and computes per-sample alignment statistics.
    """
    c = AttrDict(cfg['mapping'])
    c = check_cfg_mapping(c)
    if args.check:
        # Validation-only mode; nothing is written.
        mapping_check(c)
        return 0
    os.chdir(c.dirw)

    # Each job's shell preamble just enters the work dir.
    jcmds = [[
        "cd %s" % c.dirw,
    ], [
        "cd %s" % c.dirw,
    ], [
        "cd %s" % c.dirw,
    ]]
    # Job 1 serial, job 2 parallel, job 3 serial sort + parallel stats.
    bcfgs = [
        [dict(opt='bash')],
        [dict(opt='parallel', thread=c.pbs_ppns[1])],
        [
            dict(opt='bash'),
            dict(opt='parallel', thread=c.pbs_ppns[2]),
        ],
    ]

    assert c.njob == len(bcfgs) == len(jcmds), "not %d jobs" % c.njob
    jobs = []
    for i in range(c.njob):
        prefix = "%s.%d" % (c.job_prefix, i + 1)
        jcfg = {
            'queue': c.pbs_queues[i],
            'ppn': c.pbs_ppns[i],
            'walltime': c.pbs_walltimes[i],
            'email': c.pbs_email,
        }
        job = PbsJob.from_cfg(jcfg=jcfg,
                              jcmds=jcmds[i],
                              bcfgs=bcfgs[i],
                              prefix=prefix,
                              njob=len(bcfgs[i]),
                              bash=c.bash,
                              parallel=c.parallel)
        jobs.append(job)

    t = Table.read(c.ilist, format='ascii.tab')
    nrow = len(t)
    for i in range(nrow):
        sid = t['sid'][i]
        pre1 = "%s/%s" % (c.outdirs[0], sid)
        # Build the mapper-specific input argument string.
        # (Dead locals fsam/fbam from the original were removed.)
        input_str = ''
        if c.paired:
            f1p = t["TrimmedReadFile1Paired"][i]
            f1u = t["TrimmedReadFile1Unpaired"][i]
            f2p = t["TrimmedReadFile2Paired"][i]
            f2u = t["TrimmedReadFile2Unpaired"][i]
            if c.mapper == 'hisat2' or c.mapper == 'bowtie2':
                input_str = "-1 %s -2 %s -U %s,%s" % (f1p, f2p, f1u, f2u)
            elif c.mapper == 'bwa':
                input_str = "%s %s" % (f1p, f2p)
        else:
            ft = t["TrimmedReadFile"][i]
            if c.mapper == 'hisat2' or c.mapper == 'bowtie2':
                input_str = "-U %s" % ft
            elif c.mapper == 'bwa':
                input_str = "%s" % ft
        # Job 1: alignment command for the configured mapper.
        if c.mapper == 'bwa':
            jobs[0].subjobs[0].add_cmd("%s mem -t %s %s %s \
                    -R '@RG\\tID:%s\\tSM:%s' -a > %s.sam" % \
                    (c.bwa, c.pbs_ppns[0], c.bwa_db, input_str, \
                    sid, sid, pre1))
        elif c.mapper == 'hisat2':
            jobs[0].subjobs[0].add_cmd("%s -p %s -x %s -q %s \
                    --no-spliced-alignment --rg-id %s --rg SM:%s -S %s.sam" % \
                    (c.hisat2, c.pbs_ppns[0], c.hisat_db, input_str, \
                    sid, sid, pre1))
        elif c.mapper == 'bowtie2':
            jobs[0].subjobs[0].add_cmd("%s -p %s -x %s -q %s \
                    --rg-id %s --rg SM:%s --sensitive -S %s.sam" % \
                    (c.bowtie2, c.pbs_ppns[0], c.bowtie_db, input_str, \
                    sid, sid, pre1))

        # Job 2: SAM -> raw BAM; job 3: coordinate sort.
        jobs[1].subjobs[0].add_cmd("%s view -Sb %s.sam -o %s.raw.bam" % \
                (c.samtools, pre1, pre1))
        jobs[2].subjobs[0].add_cmd("%s sort -t %s -m 60GB %s.raw.bam -o %s.bam" % \
                (c.sambamba, c.pbs_ppns[2], pre1, pre1))

        # Job 3 (parallel sub-batch): per-sample alignment statistics.
        pre2 = "%s/%s" % (c.outdirs[1], sid)
        jobs[2].subjobs[1].add_cmd("bam stat %s.bam --isize %s.ins.tsv > %s.tsv" % \
                (pre1, pre2, pre2))

    for job in jobs:
        job.write()
    fj = "%s.sh" % c.job_prefix
    create_job_chain([job.fname for job in jobs], fj)
    logging.debug("job chain with %s jobs was created: %s" % (c.njob, fj))
Example no. 8
0
def mapping(cfg, args):
    """Create three chained PBS jobs that map trimmed reads to a reference.

    Job 1 aligns each sample with the configured mapper (bwa / hisat2 /
    bowtie2), job 2 converts SAM to a raw BAM, job 3 coordinate-sorts the
    BAM (sambamba) and computes per-sample alignment statistics.
    """
    c = AttrDict(cfg['mapping'])
    c = check_cfg_mapping(c)
    if args.check:
        # Validation-only mode; nothing is written.
        mapping_check(c)
        return 0
    os.chdir(c.dirw)

    # Each job's shell preamble just enters the work dir.
    jcmds = [[
        "cd %s" % c.dirw,
    ], [
        "cd %s" % c.dirw,
    ], [
        "cd %s" % c.dirw,
    ]]
    # Job 1 serial, job 2 parallel, job 3 serial sort + parallel stats.
    bcfgs = [
        [dict(opt = 'bash')],
        [dict(opt = 'parallel', thread = c.pbs_ppns[1])],
        [dict(opt = 'bash'),
        dict(opt = 'parallel', thread = c.pbs_ppns[2]),
        ],
    ]

    assert c.njob == len(bcfgs) == len(jcmds), "not %d jobs" % c.njob
    jobs = []
    for i in range(c.njob):
        prefix = "%s.%d" % (c.job_prefix, i+1)
        jcfg = {
            'queue': c.pbs_queues[i],
            'ppn': c.pbs_ppns[i],
            'walltime': c.pbs_walltimes[i],
            'email': c.pbs_email,
        }
        job = PbsJob.from_cfg(jcfg = jcfg, jcmds = jcmds[i], bcfgs = bcfgs[i],
                prefix = prefix, njob = len(bcfgs[i]),
                bash = c.bash, parallel = c.parallel)
        jobs.append(job)

    t = Table.read(c.ilist, format = 'ascii.tab')
    nrow = len(t)
    for i in range(nrow):
        sid = t['sid'][i]
        pre1 = "%s/%s" % (c.outdirs[0], sid)
        # Build the mapper-specific input argument string.
        # (Dead locals fsam/fbam from the original were removed.)
        input_str = ''
        if c.paired:
            f1p = t["TrimmedReadFile1Paired"][i]
            f1u = t["TrimmedReadFile1Unpaired"][i]
            f2p = t["TrimmedReadFile2Paired"][i]
            f2u = t["TrimmedReadFile2Unpaired"][i]
            if c.mapper == 'hisat2' or c.mapper == 'bowtie2':
                input_str = "-1 %s -2 %s -U %s,%s" % (f1p, f2p, f1u, f2u)
            elif c.mapper == 'bwa':
                input_str = "%s %s" % (f1p, f2p)
        else:
            ft = t["TrimmedReadFile"][i]
            if c.mapper == 'hisat2' or c.mapper == 'bowtie2':
                input_str = "-U %s" % ft
            elif c.mapper == 'bwa':
                input_str = "%s" % ft
        # Job 1: alignment command for the configured mapper.
        if c.mapper == 'bwa':
            jobs[0].subjobs[0].add_cmd("%s mem -t %s %s %s \
                    -R '@RG\\tID:%s\\tSM:%s' -a > %s.sam" % \
                    (c.bwa, c.pbs_ppns[0], c.bwa_db, input_str, \
                    sid, sid, pre1))
        elif c.mapper == 'hisat2':
            jobs[0].subjobs[0].add_cmd("%s -p %s -x %s -q %s \
                    --no-spliced-alignment --rg-id %s --rg SM:%s -S %s.sam" % \
                    (c.hisat2, c.pbs_ppns[0], c.hisat_db, input_str, \
                    sid, sid, pre1))
        elif c.mapper == 'bowtie2':
            jobs[0].subjobs[0].add_cmd("%s -p %s -x %s -q %s \
                    --rg-id %s --rg SM:%s --sensitive -S %s.sam" % \
                    (c.bowtie2, c.pbs_ppns[0], c.bowtie_db, input_str, \
                    sid, sid, pre1))

        # Job 2: SAM -> raw BAM; job 3: coordinate sort.
        jobs[1].subjobs[0].add_cmd("%s view -Sb %s.sam -o %s.raw.bam" % \
                (c.samtools, pre1, pre1))
        jobs[2].subjobs[0].add_cmd("%s sort -t %s -m 60GB %s.raw.bam -o %s.bam" % \
                (c.sambamba, c.pbs_ppns[2], pre1, pre1))

        # Job 3 (parallel sub-batch): per-sample alignment statistics.
        pre2 = "%s/%s" % (c.outdirs[1], sid)
        jobs[2].subjobs[1].add_cmd("bam stat %s.bam --isize %s.ins.tsv > %s.tsv" % \
                (pre1, pre2, pre2))

    for job in jobs:
        job.write()
    fj = "%s.sh" % c.job_prefix
    create_job_chain([job.fname for job in jobs], fj)
    logging.debug("job chain with %s jobs was created: %s" % (c.njob, fj))
Example no. 9
0
def run_ase(cfg):
    """Write a PBS job computing allele-specific expression per sample.

    For each BAM listed in *ilist*, a per-sample shell script
    (<jobpre>.jobs/NNN.sh) is written that converts the BAM to BED,
    intersects it with the target variants and gene annotation, and
    summarizes allele counts into <outdir>/<sid>.tsv.  Samples whose
    outputs already exist are skipped; the remaining scripts are run by
    one PBS job through GNU parallel.

    Note: the original imported pysam here but never used it; the
    unused import was removed.
    """
    cfg = cfg['ase']
    dirw, ilist, olist, jobpre, diro = \
            cfg['dirw'], cfg['ilist'], cfg['olist'], cfg['job_prefix'], \
            cfg['outdir']
    f_fas = cfg['genome']
    paired = cfg.getboolean('paired')
    samtools, bcftools, parallel = \
            cfg['samtools'], cfg['bcftools'], cfg['parallel']
    target_vcf, gene_bed = cfg['targetvcf'], cfg['gene_bed']
    pbs_queue, pbs_walltime, pbs_ppn, pbs_email = \
            cfg['pbs_queue'], cfg['pbs_walltime'], cfg['pbs_ppn'], cfg['pbs_email']

    if not op.isdir(dirw): os.makedirs(dirw)
    os.chdir(dirw)
    assert op.isfile(ilist), "%s not exist" % ilist
    ary = np.genfromtxt(ilist, names = True, dtype = object, delimiter = "\t")
    # Start from a clean per-sample job-script directory.
    dirj = "%s.jobs" % jobpre
    if op.isdir(dirj):
        os.system("rm -rf %s" % dirj)
    for do in [diro, dirj]:
        if not op.isdir(do):
            os.makedirs(do)
    fj = "%s.sh" % jobpre
    fhj = open(fj, "w")
    i = 1
    for row in ary:
        # np.genfromtxt(dtype=object) yields bytes fields; decode to str.
        row = [str(x, 'utf-8') for x in list(row)]
        sid = row[0]
        # BAM path column differs between paired and single-end layouts.
        if paired:
            fbam = row[11]
        else:
            fbam = row[5]
        pre = "%s/%s" % (diro, sid)
        cmds = [
            "mkdir %s" % pre,
            "bam2bed.py %s %s.1.bed" % (fbam, pre),
            "sort -T %s -k1,1 -k2,2n %s.1.bed > %s.2.sorted.bed" % (pre, pre, pre),
            "intersectBed -wa -wb -a %s.2.sorted.bed -b %s > %s.3.bed" % (pre, target_vcf, pre),
            "sort -T %s -k4,4 -k1,1 -k2,2n %s.3.bed > %s.4.sorted.bed" % (pre, pre, pre),
            "bed.ase.py %s.4.sorted.bed %s.5.tsv %s.6.bed" % (pre, pre, pre),
            "sort -T %s -k1,1 -k2,2n %s.6.bed > %s.7.sorted.bed" % (pre, pre, pre),
            "intersectBed -wa -wb -a %s -b %s.7.sorted.bed > %s.8.bed" % (gene_bed, pre, pre),
            "bed.ase.sum.py %s.5.tsv %s.8.bed %s.tsv" % (pre, pre, pre),
            "rm %s.[1-8].*" % pre,
            "rm -rf %s" % pre,
        ]
        fo = "%s/%03d.sh" % (dirj, i)
        fho = open(fo, "w")
        fho.write("\n".join(cmds) + "\n")
        fho.close()
        i += 1
        # Only schedule samples whose final outputs are missing.
        if not op.isfile("%s.bed" % pre) or not op.isfile("%s.tsv" % pre):
            fhj.write("bash %s\n" % fo)
    fhj.close()

    cmds = []
    # (A stray trailing comma after this append() used to create a
    # throwaway 1-tuple; removed.)
    cmds.append("cd %s" % dirw)
    cmds.append("%s -j %s < %s" % (parallel, pbs_ppn, fj))

    pbsjob = PbsJob(queue = pbs_queue,
            ppn = pbs_ppn,
            walltime = pbs_walltime,
            email = pbs_email,
            cmds = "\n".join(cmds)
    )
    fjob = "%s.pbs" % jobpre
    pbsjob.write(fjob)
    logging.debug("Job script '%s' has been created" % fjob)
Example no. 10
0
def mapping(cfg, args):
    """Create chained PBS jobs mapping RNA-seq reads to multiple genomes.

    For every (sample, genome) pair listed in the config table, job 1
    aligns with the configured mapper (hisat2 / tophat2 / star), sorts or
    indexes the BAM, and computes alignment statistics; job 2 counts
    sense-strand reads per gene with htseq-count.
    """
    c = AttrDict(cfg['mapping'])
    c = check_cfg_mapping(c)
    if args.check:
        # Validation-only mode; nothing is written.
        mapping_check(c)
        return 0
    os.chdir(c.dirw)

    # Both jobs' shell preambles just enter the work dir.
    jcmds = [[
        "cd %s" % c.dirw,
    ], [
        #"export _JAVA_OPTIONS='-Djava.io.tmpdir=%s'" % temp_dir,
        "cd %s" % c.dirw,
    ]]
    # Job 1: serial alignment sub-batch + parallel stats sub-batch;
    # job 2: parallel htseq-count sub-batch.
    bcfgs = [
        [dict(opt = 'bash'),
        dict(opt = 'parallel', thread = c.pbs_ppns[1])],
        [dict(opt = 'parallel', thread = c.pbs_ppns[1])]
    ]

    assert c.njob == len(bcfgs) == len(jcmds), "not %d jobs" % c.njob
    jobs = []
    for i in range(c.njob):
        prefix = "%s.%d" % (c.job_prefix, i+1)
        jcfg = {
            'queue': c.pbs_queues[i],
            'ppn': c.pbs_ppns[i],
            'walltime': c.pbs_walltimes[i],
            'email': c.pbs_email,
        }
        job = PbsJob.from_cfg(jcfg = jcfg, jcmds = jcmds[i], bcfgs = bcfgs[i],
                prefix = prefix, njob = len(bcfgs[i]),
                bash = c.bash, parallel = c.parallel)
        jobs.append(job)

    t = c.t
    nrow = len(t)
    for i in range(nrow):
        sid = str(t['sid'][i])
        # A sample may be mapped against several genomes (comma-separated).
        genomes = t['genome'][i].split(",")
        #logging.debug("mapping %s to %s" % (sid, ", ".join(genomes)))
        for genome in genomes:
            dbpre = c.genomes[genome]['db']
            gff = c.genomes[genome]['gff']
            pre1= "%s/%s_%s" % (c.outdirs[0], sid, genome)
            # Build the mapper-specific input argument string.
            input_str = ''
            if c.paired:
                f1p = t["TrimmedReadFile1Paired"][i]
                f1u = t["TrimmedReadFile1Unpaired"][i]
                f2p = t["TrimmedReadFile2Paired"][i]
                f2u = t["TrimmedReadFile2Unpaired"][i]
                if c.mapper == 'hisat2':
                    input_str = "-1 %s -2 %s -U %s,%s" % (f1p, f2p, f1u, f2u)
                elif c.mapper == 'tophat2':
                    input_str = "%s %s,%s,%s" % (f1p, f2p, f1u, f2u)
                elif c.mapper == 'star':
                    input_str = "%s %s" % (f1p, f2p)
            else:
                ft = t["TrimmedReadFile"][i]
                if c.mapper == 'hisat2':
                    input_str = "-U %s" % ft
                elif c.mapper == 'bowtie2' or c.mapper == 'star':
                    input_str = "%s" % ft
            # fbam is the mapper's final BAM; tophat2/star branches below
            # reassign it to their own output naming scheme.
            fbam = "%s.bam" % pre1
            if c.mapper == 'hisat2':
                # Align, pipe straight into samtools for BAM conversion,
                # then coordinate-sort with sambamba.
                jobs[0].subjobs[0].add_cmd("%s -p %s -x %s -q %s \
                        --rg-id %s --rg SM:%s | samtools view -Sb - \
                        -o %s.raw.bam" % \
                        (c.hisat2, c.pbs_ppns[0], dbpre, input_str, \
                        sid, sid, pre1))
                jobs[0].subjobs[0].add_cmd("%s sort -t %s -m 60GB %s.raw.bam -o %s.bam" % \
                        (c.sambamba, c.pbs_ppns[0], pre1, pre1))
            elif c.mapper == 'tophat2':
                jobs[0].subjobs[0].add_cmd("mkdir -p %s" % pre1)
                jobs[0].subjobs[0].add_cmd("%s -p %s -G %s \
                        --rg-id %s --rg-sample %s %s %s -o %s" % \
                        (c.tophat2, c.pbs_ppns[0], gff, sid, sid, dbpre, input_str, pre1))
                fbam = "%s/accepted.bam" % pre1
            elif c.mapper == 'star':
                jobs[0].subjobs[0].add_cmd("%s --runThreadN %s --genomeDir %s \
                        --readFilesIn %s --readFilesCommand zcat \
                        --outFileNamePrefix %s. --outSAMtype BAM SortedByCoordinate" %\
                        (c.star, c.pbs_ppns[0], dbpre, input_str, pre1))
                fbam = "%s.Aligned.sortedByCoord.out.bam" % pre1
                jobs[0].subjobs[0].add_cmd("%s index -t %s %s" % (c.sambamba, c.pbs_ppns[0], fbam))

            # Job 1 (parallel sub-batch): alignment/insert-size statistics.
            pre2 = "%s/%s_%s" % (c.outdirs[1], sid, genome)
            jobs[0].subjobs[1].add_cmd("bam stat %s --isize %s.ins.tsv > %s.tsv" % \
                    (fbam, pre2, pre2))

            # Job 2: sense-strand per-gene read counts via htseq-count.
            pre3 = "%s/%s_%s" % (c.outdirs[2], sid, genome)
            fsen = "%s.txt" % pre3
            fant = "%s.as.txt" % pre3
            #if not op.isfile(fsen) or os.stat(fsen).st_size == 0:
            # '-f 1' keeps only properly paired reads for PE data.
            sam_filter_tag = "-f 1" if c.paired else ""
            jobs[1].subjobs[0].add_cmd("%s view %s -F 256 %s | %s -r pos -s %s \
                        -t exon -i gene_id -m union -a 20 - %s > %s" % \
                        (c.samtools, sam_filter_tag, fbam, \
                        c.htseq, c.stranded, gff, fsen))
            #if not op.isfile(fant) or os.stat(fant).st_size == 0:
            #jobs[1].subjobs[0].add_cmd("%s view %s -F 256 %s | %s -r pos -s %s \
            #            -t exon -i gene_id -m union -a 20 - %s > %s" % \
            #            (c.samtools, sam_filter_tag, fbam, \
            #            c.htseq, 'yes', gff, fant))

    for job in jobs:
        job.write()
    fj = "%s.sh" % c.job_prefix
    create_job_chain([job.fname for job in jobs], fj)
    logging.debug("job chain with %s jobs was created: %s" % (c.njob, fj))