def sort_ubam(ubams):
    """Create one ``picard_sortUbam.sh`` job per unaligned BAM.

    Args:
        ubams: iterable of paths to unaligned BAM files.

    Returns:
        A list of ``sjm.Job`` objects, one per input, in input order.
        Each job has ``input`` set to the unaligned BAM and ``output``
        set to ``<stem>.bam`` under the module-level ``tmpdir``.
    """
    # Recognised unaligned-BAM endings, longest first so "x.u.bam" loses
    # ".u.bam" rather than only ".bam".
    # NOTE(review): confirm these match the naming convention of the inputs.
    ubam_suffixes = ('.u.bam', '.ubam', '.bam')

    jobs = []
    for ubam in ubams:
        ubam = util.File(ubam)

        # str.rstrip() takes a *set of characters*, not a suffix, so the
        # previous rstrip('u.bam') also removed trailing 'u', 'b', 'a',
        # 'm' and '.' belonging to the sample name
        # (e.g. "album.u.bam" -> "al").  Remove an explicit suffix instead.
        stem = os.path.basename(ubam.path)
        for suffix in ubam_suffixes:
            if stem.endswith(suffix):
                stem = stem[:-len(suffix)]
                break
        obam = util.File(os.path.join(tmpdir, stem + '.bam'))

        job = sjm.Job('picard_sortUbam-%s' % ubam.prefix)
        job.memory = "20G"
        job.input = ubam
        job.output = obam
        job.append('picard_sortUbam.sh %s %s' % (job.input, job.output))
        jobs.append(job)
    return jobs
def align_se(reads1, reads2):
    """Build one ``bwa_aln_se.sh`` job for each index of ``reads1``.

    ``reads1[i]`` and ``reads2[i]`` are handed to the script together with
    the module-level ``readgroup`` string.  The output BAM lives under the
    module-level ``tmpdir`` and is named after the first read file's
    prefix with every ``.R1`` / ``_R1`` (either case) occurrence removed.

    Returns the created jobs as a list, in input order.
    """
    created = []
    for idx, first in enumerate(reads1):
        second = reads2[idx]
        first_file = util.File(first)
        # The second read is wrapped as well, although the wrapper itself
        # is not used afterwards.
        util.File(second)

        stem = re.sub(r'[._][Rr]1', '', first_file.prefix)
        out_bam = util.File(os.path.join(tmpdir, stem + '.sorted.bam'))

        task = sjm.Job('bwa_aln_se-%s' % first_file.prefix)
        task.output = out_bam
        command = 'bwa_aln_se.sh %s %s %s %s' % (
            task.output, first, second, readgroup)
        task.append(command)
        created.append(task)
    return created
def handle_file_thread(path):
    """Drain a per-path queue into a ``util.File`` in an endless loop.

    A new queue with no size limit is registered under ``path`` in the
    module-level ``map_file_to_queue``.  Every item subsequently taken
    from the queue stored under that key is appended to the file object,
    which is saved after each append.  The loop has no exit condition, so
    this function does not return normally.
    """
    target = util.File(path)
    map_file_to_queue[path] = queue.Queue(0)
    while True:
        # Look the queue up on every pass, exactly as the mapping holds it.
        item = map_file_to_queue[path].get()
        target.append(item)
        target.save()
def gatk_mvcf(pjobs, vcfout):
    """Create a single ``gatk_catvcf.sh`` job combining the parents' outputs.

    Args:
        pjobs: parent jobs; each ``output`` is passed to the script.
        vcfout: file name of the combined VCF, placed under the
            module-level ``outdir``.

    Returns:
        The new job, which depends on every job in ``pjobs``.

    NOTE(review): ``bamfile`` is not defined in this function; it is
    presumably a module-level object -- confirm.
    """
    inputs = []
    for parent in pjobs:
        inputs.append(parent.output)

    cat_job = sjm.Job('gatk_CatVCF-%s' % (bamfile.prefix))
    cat_job.memory = "10G"
    cat_job.output = util.File(os.path.join(outdir, vcfout))
    joined_inputs = ' '.join(inputs)
    cat_job.append('gatk_catvcf.sh %s %s' % (cat_job.output, joined_inputs))
    cat_job.depend(*pjobs)
    return cat_job
def align_pe(reads1, reads2):
    """Build one ``bwa_aln_pe.sh`` job per read pair.

    Args:
        reads1: list of first-in-pair read files.
        reads2: list of second-in-pair read files, index-aligned with
            ``reads1`` (a shorter ``reads2`` raises ``IndexError``).

    Returns:
        A list of ``sjm.Job`` objects in input order.  Each job's
        ``output`` is ``<name>.sorted.bam`` under the module-level
        ``tmpdir``, where ``<name>`` is the first read file's prefix with
        a trailing ``.fastq`` removed for ``.gz`` inputs and every
        ``.R1`` / ``_R1`` (either case) occurrence removed.
    """
    fastq_ext = '.fastq'
    jobs = []
    for i, read1 in enumerate(reads1):
        read2 = reads2[i]
        readfile1 = util.File(read1)

        name = readfile1.prefix
        # For gzipped input the prefix may still end in ".fastq".
        # str.rstrip('.fastq') strips a *character set*, which also ate
        # trailing letters of the sample name (e.g. "data.fastq" -> "d"),
        # so remove the literal suffix instead.
        if readfile1.path.endswith('.gz') and name.endswith(fastq_ext):
            name = name[:-len(fastq_ext)]
        bamname = re.sub(r'[._][Rr]1', '', name) + '.sorted.bam'
        bam = util.File(os.path.join(tmpdir, bamname))

        job = sjm.Job('bwa_aln_pe-%s' % readfile1.prefix)
        job.output = bam
        job.memory = "15G"
        job.append('bwa_aln_pe.sh %s %s %s %s'
                   % (job.output, read1, read2, readgroup))
        jobs.append(job)
    return jobs
def sam_flagstat(pjobs):
    """Create a ``samtools flagstat`` job for each parent job's output.

    The report is written to the path produced by calling
    ``chext("flagstat.txt")`` on the wrapped parent output.  Each new job
    depends on its parent.

    Returns the new jobs as a list, in the order of ``pjobs``.
    """
    stat_jobs = []
    for parent in pjobs:
        source_bam = util.File(parent.output)
        stat_job = sjm.Job('samtools-flagstat-%s' % source_bam.prefix)
        stat_job.memory = "10G"
        stat_job.output = source_bam.chext("flagstat.txt")
        command = 'samtools flagstat %s > %s' % (source_bam, stat_job.output)
        stat_job.append(command)
        stat_job.depend(parent)
        stat_jobs.append(stat_job)
    return stat_jobs
def gatk_joint(pjobs):
    """Create the single ``gatk_gt_joint.sh`` job over all parent outputs.

    The output VCF is ``args.output`` inside ``args.outdir``; the script
    receives that path followed by the ``output.path`` of every parent
    job.  The new job depends on all of ``pjobs``.

    Returns a one-element list containing the job.
    """
    inputs = []
    for parent in pjobs:
        inputs.append(parent.output.path)

    target = util.File(os.path.join(args.outdir, args.output))
    joint = sjm.Job('GATK-joint-gt-%s' % target.name)
    joint.memory = "20G"
    joint.output = target
    joint.append('gatk_gt_joint.sh %s %s' % (joint.output, ' '.join(inputs)))
    joint.depend(*pjobs)
    return [joint]
def dedup_merge(pjobs, outbam):
    """Create one ``picard_mdup.sh`` job over all parent BAMs.

    Args:
        pjobs: parent jobs whose ``output.path`` values are the inputs.
        outbam: file name of the resulting BAM, placed under the
            module-level ``outdir``; also used in the job name.

    Returns:
        A one-element list with the job, which depends on every parent.
    """
    inputs = [parent.output.path for parent in pjobs]

    mdup = sjm.Job('picard_mdup-%s' % outbam)
    mdup.memory = "20G"
    mdup.output = util.File(os.path.join(outdir, outbam))
    command = 'picard_mdup.sh %s %s' % (mdup.output, ' '.join(inputs))
    mdup.append(command)
    mdup.depend(*pjobs)
    return [mdup]
def gatk_hc(pjobs):
    """Create a ``gatk_hc.sh`` job for each parent job.

    Each job writes ``<prefix>.g.vcf.gz`` under the module-level
    ``tmpdir`` (stored on the job as a plain path string), carries over
    the parent's ``regions`` attribute, and depends on its parent.

    Returns the new jobs as a list, in the order of ``pjobs``.
    """
    callers = []
    for parent in pjobs:
        source = util.File(parent.output)
        caller = sjm.Job('gatk_haplotypecaller-%s' % (source.prefix))
        caller.memory = "40G"
        gvcf_name = '%s.%s' % (source.prefix, 'g.vcf.gz')
        caller.output = os.path.join(tmpdir, gvcf_name)
        caller.regions = parent.regions
        command = 'gatk_hc.sh %s %s %s' % (
            caller.output, source.path, parent.regions)
        caller.append(command)
        caller.depend(parent)
        callers.append(caller)
    return callers
def gatk_gt(pjobs):
    """Create a ``gatk_gt.sh`` job for each parent job.

    The parent's output is wrapped with ``iszipfile=True``; the new job
    writes ``<prefix>.gt.vcf.gz`` under the module-level ``tmpdir``
    (stored as a plain path string), copies the parent's ``regions``
    attribute, and depends on its parent.

    Returns the new jobs as a list, in the order of ``pjobs``.
    """
    genotypers = []
    for parent in pjobs:
        source = util.File(parent.output, iszipfile=True)
        genotyper = sjm.Job('gatk_genotypeGVCFs-%s' % (source.prefix))
        genotyper.memory = "15G"
        vcf_name = '%s.%s' % (source.prefix, 'gt.vcf.gz')
        genotyper.output = os.path.join(tmpdir, vcf_name)
        genotyper.regions = parent.regions
        command = 'gatk_gt.sh %s %s %s' % (
            genotyper.output, source.path, parent.regions)
        genotyper.append(command)
        genotyper.depend(parent)
        genotypers.append(genotyper)
    return genotypers
def merge_aln(pjobs):
    """Create a ``picard_mergeBam.sh`` job for each parent job.

    Args:
        pjobs: parent jobs; ``output`` is the aligned BAM and ``input``
            the unaligned BAM that produced it.

    Returns:
        A list of ``sjm.Job`` objects in the order of ``pjobs``.  Each
        job's ``output`` is the aligned BAM path with its ``.aln.bam``
        (or, failing that, ``.bam``) ending replaced by ``.sort.bam``,
        and each job depends on its parent.
    """
    jobs = []
    for pjob in pjobs:
        alnbam = pjob.output
        ubam = pjob.input
        job = sjm.Job('picard_mergeBam-%s' % alnbam.name)
        job.memory = "10G"

        # str.rstrip('.aln.bam') strips a *character set*, so it also ate
        # trailing 'a', 'l', 'n', 'b', 'm' and '.' of the sample name
        # (e.g. "normal.aln.bam" -> "nor").  Remove a literal suffix.
        stem = alnbam.path
        for suffix in ('.aln.bam', '.bam'):
            if stem.endswith(suffix):
                stem = stem[:-len(suffix)]
                break
        job.output = util.File(stem + '.sort.bam')

        job.append('picard_mergeBam.sh %s %s %s' % (job.output, alnbam, ubam))
        job.depend(pjob)
        jobs.append(job)
    return jobs
def gatk_recal(pjobs):
    """Create a ``gatk_recal.sh`` job for each parent job.

    Each job writes ``<prefix>.recal.bam`` under the module-level
    ``tmpdir`` (stored as a plain path string), copies the parent's
    ``regions`` attribute, and depends on its parent.

    Returns the new jobs as a list, in the order of ``pjobs``.
    """
    recal_jobs = []
    for parent in pjobs:
        source = util.File(parent.output)
        recal = sjm.Job('gatk_recalibrate-%s' % (source.prefix))
        recal.memory = "20G"
        bam_name = '%s.%s' % (source.prefix, 'recal.bam')
        recal.output = os.path.join(tmpdir, bam_name)
        recal.regions = parent.regions
        recal.append('gatk_recal.sh %s %s' % (recal.output, source.path))
        recal.depend(parent)
        recal_jobs.append(recal)
    return recal_jobs
def merge_bam(pjobs, out_prefix, suffix=None):
    '''
    Create one job that runs ``samtools merge`` over the parents' BAMs and
    then ``samtools index`` on the result.

    Args:
        pjobs: parent jobs whose ``output.path`` values are merged.
        out_prefix: base name of the merged BAM, placed under the
            module-level ``tmpdir``.
        suffix: optional tag inserted before ``.bam`` and used in the job
            name.  When omitted, the output is ``<out_prefix>.bam`` and
            the job is named after ``out_prefix``.

    Returns:
        The new job, which depends on every job in ``pjobs``.

    Caveat: If output bam exists, needs to apply "-f" to overwrite or task
    will abort.
    '''
    bams = [pjob.output.path for pjob in pjobs]

    # The default suffix=None used to be formatted verbatim, producing a
    # job called "samtools_merge-None" and a file "<out_prefix>.None.bam".
    if suffix is None:
        label = out_prefix
        bam_name = '%s.bam' % out_prefix
    else:
        label = suffix
        bam_name = '%s.%s.bam' % (out_prefix, suffix)

    job = sjm.Job('samtools_merge-%s' % label)
    job.memory = "5G"
    outname = os.path.join(tmpdir, bam_name)
    job.output = util.File(outname)
    job.append('samtools merge %s %s && samtools index %s'
               % (job.output, ' '.join(bams), job.output))
    job.depend(*pjobs)
    return job
def merge_gvcf(gvcfs):
    """Create ``gatk_combine_gvcf.sh`` jobs over fixed-size batches.

    ``gvcfs`` is cut into consecutive slices of ``args.merge_count``
    entries (the last slice may be shorter).  Batch ``i`` is written to
    ``<args.temp_prefix>.batch<i>.g.vcf.gz`` inside ``args.tempdir``.

    Returns the created jobs as a list, in batch order.
    """
    combine_jobs = []
    step = args.merge_count
    for batch_no, offset in enumerate(range(0, len(gvcfs), step)):
        members = gvcfs[offset:offset + step]
        batch_name = '%s.batch%d.g.vcf.gz' % (args.temp_prefix, batch_no)
        combined = util.File(os.path.join(args.tempdir, batch_name))

        combine = sjm.Job('gatk_combine_gvcf-%s' % combined.name)
        combine.memory = "40G"
        combine.output = combined
        combine.append(
            'gatk_combine_gvcf.sh %s %s' % (combine.output, " ".join(members)))
        combine_jobs.append(combine)
    return combine_jobs
# ---- command line -------------------------------------------------------
p = argparse.ArgumentParser(
    description='run_gatk.py -b tiny_b38.bam -o `pwd` --tmp /rgs01/scratch_space/ -r $ref_genome.gatk -j cap_tiny.sjm'
)
p.add_argument(
    '-b', '--bam', metavar='STR', required=True,
    help='Support for aligned and dedupped BAMs as input',
)
p.add_argument(
    '-j', '--jobfile', metavar='FILE',
    help='The jobfile name (default: stdout)',
)
p.add_argument(
    '-o', '--output', metavar='DIR', required=True,
    help='The output directory, will be created if not present',
)
p.add_argument(
    '-r', '--regions_file', metavar='FILE', required=True,
    help='A fiel that defines the regions of GATK parallele run',
)
p.add_argument(
    '-A', '--account', metavar='STR',
    help='Account that were used to run the pipeline',
)
p.add_argument(
    '-T', '--tmp', metavar='DIR', required=True,
    help='The TMP directory for storing intermediate files, will be created if not exist (default=output directory',
)
p.add_argument(
    '--skip_realn_recal', action='store_true',
    help='Skip GATK relingment and recalibration',
)
p.add_argument(
    '--skip_recal', action='store_true',
    help='Skip GATK recalibration only',
)
p.add_argument('--submit', action='store_true', help='Submit the jobs')
args = p.parse_args()

# No job file given -> jobfile stays None.
jobfile = None if args.jobfile is None else util.File(args.jobfile)

# ---- directories --------------------------------------------------------
outdir = util.Dir(args.output)
logdir = util.Dir(outdir, 'log')
# Intermediate files go to --tmp when given, otherwise to the output dir.
tmpdir = util.Dir(args.tmp) if args.tmp else outdir
tmpdir.mkdirs()
outdir.mkdirs()

# ---- class-level job defaults -------------------------------------------
sjm.Job.name_prefix = "GATK" + "."
sjm.Job.memory = "20G"  # used when a job does not set its own memory
sjm.Job.queue = "pcgp"
sjm.Job.project = "CompBio"
if args.account:
    sjm.Job.sge_options = "-A %s" % args.account

# From here on tmpdir is the builtin-str conversion of the directory object.
tmpdir = getattr(__builtins__, 'str')(tmpdir)
# Remaining command-line options for the BWA mapping script.  The parser
# `p` and the earlier options (e.g. the ones behind args.outdir, args.tmp,
# args.sm, args.id, args.lb, args.pl, args.memory, args.jobfile) are
# defined before this fragment and are not shown here.
p.add_argument('-q', '--queue', metavar='NAME', default="normal", help='Queue for jobs (default: normal)')
p.add_argument('-t', '--threads', metavar='COUNT', type=int, default=4, help='Number of threads for BWA alignment, only works for SGE (default: 4)')
p.add_argument('--account', metavar='STR', default="swang", help='Accounting string for the purpose of cluster accounting.')
p.add_argument('--submit', action='store_true', help='Submit the jobs')
args = p.parse_args()

# Create the output directory, and a per-sample temp directory named after
# args.sm underneath args.tmp.
outdir=util.Dir(args.outdir)
outdir.mkdirs()
tmpdir=util.Dir(os.path.join(args.tmp, args.sm))
tmpdir.mkdirs()

# jobfile stays None when no job file name was given.
if args.jobfile is None:
    jobfile=None
else:
    jobfile=util.File(args.jobfile)

# Single-quoted @RG header string built from the id/lb/sm/pl options.
# NOTE(review): the backslashes before each "t" are doubled in the literal,
# presumably so a later shell/script layer can unescape them -- confirm.
readgroup = "'@RG\\\\tID:%s\\\\tLB:%s\\\\tSM:%s\\\\tPL:%s'" % (args.id, args.lb, args.sm, args.pl)

# Class-level defaults for every sjm.Job created afterwards.
# NOTE(review): the queue is fixed to "pcgp" here; args.queue is not used
# anywhere in this fragment.
sjm.Job.name_prefix="BWA-mapping"+"."
sjm.Job.memory="%sG"%args.memory
sjm.Job.queue="pcgp"
sjm.Job.project="CompBio"

# Replace both directory objects by their builtin-str conversions.
tmpdir = getattr(__builtins__, 'str')(tmpdir)
outdir = getattr(__builtins__, 'str')(outdir)

# NOTE(review): only the first statements of align_pe are visible in this
# fragment; the definition appears to continue beyond it.
def align_pe(reads1, reads2):
    jobs=[]
    for i in range(0, len(reads1)):
        read1 = reads1[i]
def suggestor(filename, body):
    """Yield the patches that move (and possibly rename) one symbol.

    ``old_fullname``, ``new_fullname`` and ``project_root`` are not
    parameters; they are read from an enclosing scope that is not shown
    here.  Both full names are split at their last dot into a module part
    and a symbol part.

    Args:
        filename: name of the file being examined.  Nothing is yielded
            unless it is the file of the old module.
        body: the text of that file.

    Yields:
        ``khodemod.Patch`` objects: an in-place replacement when the old
        and new modules are the same; otherwise a removal from the old
        file followed by an addition at the end of the new file.  In both
        cases, every patch produced by ``_add_init_py`` for the new file
        is yielded afterwards.

    Raises:
        khodemod.FatalError: if the symbol is not among the old module's
            top-level names, or its name token cannot be located.
    """
    (old_module, old_symbol) = old_fullname.rsplit('.', 1)
    (new_module, new_symbol) = new_fullname.rsplit('.', 1)

    # We only need to operate on the old file (although we'll generate a
    # patch for the new one as well).  Caller should ensure this but we
    # check to be safe.
    if filename != util.filename_for_module_name(old_module):
        return

    file_info = util.File(filename, body)

    # Find where old_fullname is defined in old_module.
    # TODO(csilvers): traverse try/except, for, etc, and complain
    # if we see the symbol defined inside there.
    # TODO(csilvers): look for ast.AugAssign and complain if our
    # symbol is in there.
    old_module_toplevel = util.toplevel_names(file_info)
    if old_symbol not in old_module_toplevel:
        raise khodemod.FatalError(
            filename, 0,
            "Could not find symbol '%s' in '%s': "
            "maybe it's in a try/finally or if?"
            % (old_symbol, old_module))

    # Now get the startpos and endpos of this symbol's definition.
    node_to_move = old_module_toplevel[old_symbol]
    start, end = util.get_area_for_ast_node(node_to_move, file_info,
                                            include_previous_comments=True)
    definition_region = body[start:end]

    # Decide what text to add, which may require a rename.
    if old_symbol == new_symbol:
        new_definition_region = definition_region
    else:
        # Find the token with the name of the symbol, and update it.
        if isinstance(node_to_move, (ast.FunctionDef, ast.ClassDef)):
            # Scan for the 'def'/'class' keyword; the for/else raises when
            # the loop finishes without finding one.
            for token in file_info.tokens.get_tokens(node_to_move):
                if token.string in ('def', 'class'):
                    break
            else:
                raise khodemod.FatalError(
                    filename, 0,
                    "Could not find symbol '%s' in "
                    "'%s': maybe it's defined weirdly?"
                    % (old_symbol, old_module))
            # We want the token after the def.
            name_token = file_info.tokens.next_token(token)
        else:  # isinstance(node_to_move, ast.Assign)
            # The name should be a single token, if we get here.
            # (The one-element unpacking fails if the first assignment
            # target has any other number of tokens.)
            name_token, = list(
                file_info.tokens.get_tokens(node_to_move.targets[0]))

        if name_token.string != old_symbol:
            raise khodemod.FatalError(
                filename, 0,
                "Could not find symbol '%s' in "
                "'%s': maybe it's defined weirdly?"
                % (old_symbol, old_module))
        # Splice the new name over the old name token, keeping the text of
        # the region before and after it.
        new_definition_region = (body[start:name_token.startpos] + new_symbol
                                 + body[name_token.endpos:end])

    if old_module == new_module:
        # Just patch the module in place.
        yield khodemod.Patch(filename, definition_region,
                             new_definition_region, start, end)
    else:
        # Remove the region from the old file.
        # (If we've removed the remainder of the file,
        # _remove_empty_files_suggestor will clean up.)
        yield khodemod.Patch(filename, definition_region, '', start, end)

        # Add the region to the new file.
        new_filename = util.filename_for_module_name(new_module)
        # A falsy result from read_file is treated as an empty body.
        new_file_body = khodemod.read_file(project_root, new_filename) or ''

        # Mess about with leading newlines.  First, we strip any existing
        # ones.  Then, if we are adding to an existing file, we add enough
        # to satisfy pep8.
        new_definition_region = new_definition_region.lstrip('\r\n')
        if new_file_body:
            # Count of trailing newline characters of the existing body.
            # NOTE: the second difference is always 0 here, because the
            # region was stripped of leading '\r\n' just above.
            current_newlines = (len(new_file_body)
                                - len(new_file_body.rstrip('\r\n'))
                                + len(new_definition_region)
                                - len(new_definition_region.lstrip('\r\n')))
            if current_newlines < 3:
                new_definition_region = ('\n' * (3 - current_newlines)
                                         + new_definition_region)

        # Now we need to add the new symbol to new_module.
        # TODO(benkraft): Allow, as an option, adding it after a specific
        # other symbol in new_module.
        # Zero-width patch at the end of the new file's current body.
        yield khodemod.Patch(new_filename, '', new_definition_region,
                             len(new_file_body), len(new_file_body))

        # TODO(benkraft): Fix up imports in the new and old modules.

    # Runs for both branches above, so the new file's name is computed
    # again here.
    new_filename = util.filename_for_module_name(new_module)
    for patch in _add_init_py(new_filename):
        yield patch
metavar='FILE', required=True, help='Final single vcf output file if choose joint call')
# NOTE(review): the line above is the tail of a call that starts before
# this fragment; the parser `p`, the group `g2` and the earlier options
# are not shown here.
g2.add_argument('--skip_realn_recal', action='store_true', help='Skip GATK relingment and recalibration')
g2.add_argument('--skip_recal', action='store_true', help='Skip GATK recalibration only')
p.add_argument('--submit', action='store_true', help='Submit the jobs')
args = p.parse_args()

# jobfile stays None when no job file name was given.
if args.jobfile is None:
    jobfile = None
else:
    jobfile = util.File(args.jobfile)

# Output directory, a 'log' directory object beneath it, and the temp
# directory, which falls back to the output directory when args.tmp is
# empty.
outdir = util.Dir(args.output)
logdir = util.Dir(outdir, 'log')
tmpdir = outdir
if args.tmp:
    tmpdir = util.Dir(args.tmp)
tmpdir.mkdirs()
outdir.mkdirs()

# Class-level defaults for every sjm.Job created afterwards.
sjm.Job.name_prefix = "GATK" + "."
sjm.Job.memory = "20G"
sjm.Job.queue = "pcgp"
sjm.Job.project = "CompBio"
# Only pass an accounting option when an account was supplied.
if args.account:
    sjm.Job.sge_options = "-A %s" % args.account

# Replace both directory objects by their builtin-str conversions.
tmpdir = getattr(__builtins__, 'str')(tmpdir)
outdir = getattr(__builtins__, 'str')(outdir)