def __call__(self, target_dir):
    """Process one build-38 realignment directory: copy the CRAM, rewrite
    GVCFs, and sync QC files into the output directory.

    All heavy work is submitted as LSF jobs via ``self.lsf_job_runner``;
    each command is first written to a small shell script under the
    per-sample log directory and launched with ``/bin/bash``.

    :param target_dir: path to a Build38RealignmentDirectory to process.
    :returns: the output directory path on success, ``None`` when the
        directory fails validation and ``self.force`` is not set.
    """
    d = Build38RealignmentDirectory(target_dir)
    validator = B38DirectoryValidator(d)
    if validator.valid_directory() or self.force:
        logger.info('Directory valid for processing')
        outdir = self.output_directory(d)
        # Lazy %-style args: message is only formatted if the level is enabled.
        logger.info('Output directory is %s', outdir)
        utils.force_make_dirs(outdir)
        stdout_dir = os.path.join(self.logdir, d.sample_name())
        utils.force_make_dirs(stdout_dir)
        # always submit a CRAM transfer because we use rsync
        # and it checks these things...
        copy_stdout = os.path.join(stdout_dir, 'cram_copy.log')
        # Shortcutter caches md5s so unchanged CRAM/CRAI files are skipped.
        cram_shortcutter = Shortcutter(d, outdir, '.cram_file_md5s.json',
                                       lambda x: x.cram_files())
        cram, crai = d.cram_files()
        new_cram = os.path.basename(cram)
        output_cram = os.path.join(outdir, new_cram)
        output_crai = output_cram + '.crai'
        if not (cram_shortcutter.can_shortcut(cram, output_cram)
                and cram_shortcutter.can_shortcut(crai, output_crai)):
            cram_copy_cmd = RsyncCmd()
            # NOTE(review): rsync source is d.cram_file() (singular) while the
            # shortcut check used d.cram_files() — presumably the singular call
            # covers both CRAM and index via rsync; confirm against the class.
            cram_copy_cmdline = cram_copy_cmd(d.cram_file(), outdir)
            script_file = os.path.join(stdout_dir, 'cram_copy.sh')
            with open(script_file, 'w') as f:
                f.write(cram_copy_cmdline + "\n")
            self.lsf_job_runner.launch(['/bin/bash', script_file],
                                       {'stdout': copy_stdout})
        # Rewrite each GVCF unless its md5 shows it is already up to date.
        shortcutter = Shortcutter(d, outdir, '.gvcf_file_md5s.json',
                                  lambda x: x.all_gvcf_files())
        for gvcf in d.all_gvcf_files():
            new_gzvcf = os.path.basename(gvcf)
            output_gzvcf = os.path.join(outdir, new_gzvcf)
            if not shortcutter.can_shortcut(gvcf, output_gzvcf):
                cmd = RewriteGvcfCmd(
                    reference='/gscmnt/gc2802/halllab/ccdg_resources/genomes/human/GRCh38DH/all_sequences.fa',
                )
                cmdline = cmd(gvcf, output_gzvcf)
                script_file = os.path.join(stdout_dir, new_gzvcf + '.sh')
                with open(script_file, 'w') as f:
                    f.write(cmdline + "\n")
                stdout = os.path.join(stdout_dir, new_gzvcf + '.log')
                lsf_options = {
                    'stdout': stdout,
                }
                self.lsf_job_runner.launch(['/bin/bash', script_file],
                                           lsf_options)
        # Sync QC files
        qc_outdir = os.path.join(outdir, 'qc')
        utils.force_make_dirs(qc_outdir)
        self._qc_files(d, qc_outdir, stdout_dir)
        return outdir
    else:
        # logger.warn is a deprecated alias; warning() is the supported API.
        logger.warning('Invalid for processing')
        return None
def __call__(self, target_dir):
    """Process one build-38 realignment directory: copy the CRAM, rewrite
    GVCFs with GATK, and sync QC files into the output directory.

    Unlike the script-writing variant elsewhere in this file, this version
    hands command lines directly to ``self.lsf_job_runner.launch``.

    :param target_dir: path to a Build38RealignmentDirectory to process.
    :returns: the output directory path on success, ``None`` when the
        directory fails validation and ``self.force`` is not set.
    """
    d = Build38RealignmentDirectory(target_dir)
    validator = B38DirectoryValidator(d)
    if validator.valid_directory() or self.force:
        logger.info('Directory valid for processing')
        outdir = self.output_directory(d)
        # Lazy %-style args: message is only formatted if the level is enabled.
        logger.info('Output directory is %s', outdir)
        utils.force_make_dirs(outdir)
        stdout_dir = os.path.join(self.logdir, d.sample_name())
        utils.force_make_dirs(stdout_dir)
        # always submit a CRAM transfer because we use rsync
        # and it checks these things...
        copy_stdout = os.path.join(stdout_dir, 'cram_copy.log')
        # Shortcutter caches md5s so unchanged CRAM/CRAI files are skipped.
        cram_shortcutter = Shortcutter(d, outdir, '.cram_file_md5s.json',
                                       lambda x: x.cram_files())
        cram, crai = d.cram_files()
        new_cram = os.path.basename(cram)
        output_cram = os.path.join(outdir, new_cram)
        output_crai = output_cram + '.crai'
        if not (cram_shortcutter.can_shortcut(cram, output_cram)
                and cram_shortcutter.can_shortcut(crai, output_crai)):
            cram_copy_cmd = RsyncCmd()
            # NOTE(review): rsync source is d.cram_file() (singular) while the
            # shortcut check used d.cram_files() — confirm the singular call
            # covers both CRAM and index.
            cram_copy_cmdline = cram_copy_cmd(d.cram_file(), outdir)
            self.lsf_job_runner.launch(cram_copy_cmdline,
                                       {'stdout': copy_stdout})
        # Rewrite each GVCF unless its md5 shows it is already up to date.
        shortcutter = Shortcutter(d, outdir, '.gvcf_file_md5s.json',
                                  lambda x: x.all_gvcf_files())
        for gvcf in d.all_gvcf_files():
            new_gzvcf = os.path.basename(gvcf)
            output_gzvcf = os.path.join(outdir, new_gzvcf)
            if not shortcutter.can_shortcut(gvcf, output_gzvcf):
                cmd = RewriteGvcfCmd(
                    java='/gapp/x64linux/opt/java/jdk/jdk1.8.0_60/bin/java',
                    max_mem='3500M',
                    max_stack='3500M',
                    gatk_jar='/gscmnt/gc2802/halllab/ccdg_resources/lib/GenomeAnalysisTK-3.5-0-g36282e4.jar',
                    reference='/gscmnt/gc2802/halllab/ccdg_resources/genomes/human/GRCh38DH/all_sequences.fa',
                    break_multiple=1000000)
                cmdline = cmd(gvcf, output_gzvcf)
                stdout = os.path.join(stdout_dir, new_gzvcf + '.log')
                lsf_options = {
                    'stdout': stdout,
                }
                self.lsf_job_runner.launch(cmdline, lsf_options)
        # Sync QC files
        qc_outdir = os.path.join(outdir, 'qc')
        utils.force_make_dirs(qc_outdir)
        self._qc_files(d, qc_outdir, stdout_dir)
        return outdir
    else:
        # logger.warn is a deprecated alias; warning() is the supported API.
        logger.warning('Invalid for processing')
        return None
def call_svs(app, workorders):
    """Drive the SV-calling pipeline (cnvnator, extract, lumpy, svtyper)
    for every eligible sample in the given work orders.

    Each sample's stage scripts run synchronously via subprocess; a sample
    is only marked verified once every stage reports complete, so this
    function is intended to be re-run until all samples finish.

    :param app: application config; only ``app.database`` is read here.
    :param workorders: iterable of work-order identifiers to query by.
    """
    Session = open_db(app.database)
    for workorder in workorders:
        session = Session()
        for sample in session.query(ComputeWorkflowSample).filter(
                ComputeWorkflowSample.source_work_order == workorder):
            # `!= True` (rather than `not ...`) also matches NULL/None from
            # the database, not just False.
            if (sample.analysis_cram_verifyed
                    and sample.analysis_sv_verified != True):
                # Default the SV path to a 'sv' dir beside the CRAM output.
                if sample.analysis_sv_path is None:
                    sample.analysis_sv_path = os.path.join(
                        sample.analysis_cram_path, 'sv')
                # NOTE(review): the CRAM is located via the *gvcf* analysis
                # directory — presumably AnalysisDirectory indexes outputs by
                # glob across the sample tree; confirm this is intentional.
                directory = AnalysisDirectory(sample.analysis_gvcf_path)
                cram_file = directory.output_file_dict['*.cram'][0]
                filename = os.path.basename(cram_file)
                sample_name = filename.split('.cram')[0]
                sv_directory = AnalysisSvDirectory(sample.analysis_sv_path)
                complete = True
                if not sv_directory.staging_complete():
                    # stage directory
                    force_make_dirs(sample.analysis_sv_path)
                    force_symlink(
                        cram_file,
                        os.path.join(sample.analysis_sv_path, filename))
                    force_symlink(
                        cram_file + '.crai',
                        os.path.join(sample.analysis_sv_path,
                                     filename + '.crai'))
                # Stage scripts below are run with cwd set to the SV dir;
                # they receive only the CRAM basename.
                os.chdir(sample.analysis_sv_path)
                if not sv_directory.cnvnator_complete():
                    # launch cnvnator
                    complete = False
                    print(
                        subprocess.check_output([
                            '/bin/bash',
                            '/gscuser/dlarson/src/internal-sv-pipeline/cnvnator_histogram.sh',
                            filename
                        ]))
                # extract must finish before lumpy/svtyper are attempted:
                # the elif chain runs at most one of the three per pass.
                if not sv_directory.extract_complete():
                    # launch
                    complete = False
                    print(
                        subprocess.check_output([
                            '/bin/bash',
                            '/gscuser/dlarson/src/internal-sv-pipeline/extract_sv_reads.sh',
                            filename
                        ]))
                elif not sv_directory.lumpy_complete():
                    # launch
                    complete = False
                    subprocess.call([
                        '/bin/bash',
                        '/gscuser/dlarson/src/internal-sv-pipeline/lumpy.sh',
                        filename
                    ])
                elif not sv_directory.svtyper_complete():
                    complete = False
                    subprocess.call([
                        '/bin/bash',
                        '/gscuser/dlarson/src/internal-sv-pipeline/genotype.sh',
                        filename
                    ])
                # Persist progress after each sample so a crash mid-workorder
                # does not lose completed state.
                sample.analysis_sv_verified = complete
                session.commit()
                if complete:
                    logger.info("{0} complete".format(sample_name))
        session.close()
def oldband(app, output_dir, workorders):
    """Submit LSF jobs that (re)generate old-banded GVCFs per chromosome
    for every CRAM-verified sample in the given work orders.

    Per-chromosome jobs are launched directly; the extended-chromosome job
    is wrapped in a self-deleting shell script because it passes a multi
    '-L'-joined interval string.

    :param app: application config; reads ``queue``, ``job_group`` and
        ``database``.
    :param output_dir: root directory for per-sample LSF log files.
    :param workorders: iterable of work-order identifiers to query by.
    """
    # Prevent LSF from inheriting this process's environment into jobs.
    os.environ['LSF_NO_INHERIT_ENVIRONMENT'] = 'true'
    default_job_options = {
        'memory_in_gb': 10,
        'queue': app.queue,
        'docker': 'registry.gsc.wustl.edu/genome/gatk-3.5-0-g36282e4:1',
    }
    if app.job_group is not None:
        default_job_options['group'] = app.job_group
    job_runner = LsfJob(default_job_options)
    logdir = os.path.join(output_dir, 'log')
    Session = open_db(app.database)
    # One command template reused for every sample/chromosome; note the
    # java/jar paths are the in-container locations for the docker image.
    cmd = OldbandandRewriteGvcfCmd(
        java='/usr/bin/java',
        max_mem='8G',
        max_stack='8G',
        gatk_jar='/opt/GenomeAnalysisTK.jar',
        reference='/gscmnt/gc2802/halllab/ccdg_resources/genomes/human/GRCh38DH/all_sequences.fa',
        break_multiple=1000000)
    for wo in workorders:
        session = Session()
        for sample in session.query(ComputeWorkflowSample).filter(
                ComputeWorkflowSample.source_work_order == wo):
            if (sample.analysis_cram_verifyed):
                # Sample name is the basename of its CRAM analysis directory;
                # the CRAM itself is assumed to be <name>.cram inside it.
                cram_path = sample.analysis_cram_path
                sample_name = os.path.basename(cram_path)
                cram_file = os.path.join(sample.analysis_cram_path,
                                         '{}.cram'.format(sample_name))
                oldband_path = os.path.join(sample.analysis_gvcf_path,
                                            'oldbanded_gvcfs')
                force_make_dirs(oldband_path)
                stdout_dir = os.path.join(logdir, sample_name)
                for chrom in chromosomes:
                    new_gzvcf = '{0}.{1}.g.vcf.gz'.format(sample_name, chrom)
                    output_gzvcf = os.path.join(oldband_path, new_gzvcf)
                    # Skip chromosomes whose GVCF and index already exist.
                    if not os.path.exists(output_gzvcf) or not os.path.exists(
                            output_gzvcf + '.tbi'):
                        stdout = os.path.join(stdout_dir,
                                              new_gzvcf + '.oldbanded.log')
                        cmdline = cmd(cram_file, output_gzvcf, chrom)
                        lsf_options = {
                            'stdout': stdout,
                        }
                        job_runner.launch(cmdline, lsf_options)
                # do ext
                # Extended chromosomes go into one job covering all of them
                # as a single ' -L '-joined interval argument.
                chrom_string = ' -L '.join(ext_chromosomes)
                new_gzvcf = '{0}.extChr.g.vcf.gz'.format(sample_name)
                output_gzvcf = os.path.join(oldband_path, new_gzvcf)
                if not os.path.exists(output_gzvcf) or not os.path.exists(
                        output_gzvcf + '.tbi'):
                    script = os.path.join(oldband_path, 'oldband_extChr.sh')
                    cmdline = cmd(cram_file, output_gzvcf, chrom_string)
                    # The script removes itself on success.
                    cmdline += ' && rm -f {0}'.format(script)
                    with open(script, 'w') as f:
                        f.write('#!/bin/bash\n')
                        f.write(cmdline)
                        f.write('\n')
                    stdout = os.path.join(stdout_dir,
                                          new_gzvcf + '.oldbanded.log')
                    lsf_options = {
                        'stdout': stdout,
                    }
                    job_runner.launch('/bin/bash {0}'.format(script),
                                      lsf_options)
def __init__(self, dest_dir, job_runner, force=False):
    """Record the destination, job runner, and force flag, and create
    the log directory under the destination.

    :param dest_dir: root directory results are written into.
    :param job_runner: LSF job runner used to launch work.
    :param force: when True, process directories even if validation fails.
    """
    self.dest_dir = dest_dir
    self.lsf_job_runner = job_runner
    self.force = force
    # Logs live in a 'log' subdirectory of the destination; ensure it exists.
    self.logdir = os.path.join(dest_dir, 'log')
    utils.force_make_dirs(self.logdir)