def gen_qsubs():
  # Generate qsub shell scripts and commands for easy parallelization
  print('Generating qsub scripts...')
  qsubs_dir = _config.QSUBS_DIR + NAME + '/'
  util.ensure_dir_exists(qsubs_dir)
  qsub_commands = []

  # Generate qsubs only for unfinished jobs
  treat_control_df = pd.read_csv(_config.DATA_DIR + 'treatment_control_design.csv', index_col=0)

  num_scripts = 0
  for idx, row in treat_control_df.iterrows():
    treat_nm = row['Treatment']
    if 'Cas9' in treat_nm:
      continue

    # Chunk size depends on library size
    lib_nm = _data.get_lib_nm(treat_nm)
    if lib_nm == 'LibA':
      num_targets = 2000
      num_targets_per_split = 200
    elif lib_nm == 'CtoGA':
      num_targets = 4000
      num_targets_per_split = 500
    else:
      num_targets = 12000
      num_targets_per_split = 2000

    # One job per chunk of targets
    for start_idx in range(0, num_targets, num_targets_per_split):
      end_idx = start_idx + num_targets_per_split - 1

      # Skip completed jobs: a non-empty output pickle already exists
      out_pkl_fn = out_dir + '%s_%s_%s.pkl' % (treat_nm, start_idx, end_idx)
      if os.path.isfile(out_pkl_fn):
        if os.path.getsize(out_pkl_fn) > 0:
          continue

      command = 'python %s.py %s %s %s' % (NAME, treat_nm, start_idx, end_idx)
      script_id = NAME.split('_')[0]

      # Scale requested RAM with the size of the input pickle
      try:
        mb_file_size = _data.check_file_size(treat_nm, 'ag5a4_profile_subset')
      except FileNotFoundError:
        mb_file_size = 0
      ram_gb = 2
      if mb_file_size > 140:
        ram_gb = 4
      if mb_file_size > 400:
        ram_gb = 8
      if mb_file_size > 1000:
        ram_gb = 16

      # Write shell scripts
      sh_fn = qsubs_dir + 'q_%s_%s_%s.sh' % (script_id, treat_nm, start_idx)
      with open(sh_fn, 'w') as f:
        f.write('#!/bin/bash\n%s\n' % (command))
      num_scripts += 1

      # Write qsub commands
      qsub_commands.append('qsub -V -P regevlab -l h_rt=4:00:00,h_vmem=%sG -wd %s %s &' % (ram_gb, _config.SRC_DIR, sh_fn))

  # Save commands
  commands_fn = qsubs_dir + '_commands.sh'
  with open(commands_fn, 'w') as f:
    f.write('\n'.join(qsub_commands))
  subprocess.check_output('chmod +x %s' % (commands_fn), shell=True)
  print('Wrote %s shell scripts to %s' % (num_scripts, qsubs_dir))
  return
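# A minimal sketch, not part of the original pipeline: run the generated
# per-chunk scripts serially on the local machine instead of submitting
# them to Grid Engine via _commands.sh. The helper name run_qsubs_locally
# is hypothetical; it assumes gen_qsubs() above has already populated
# qsubs_dir with q_*.sh wrappers and reuses the module-level _config,
# NAME, and subprocess used elsewhere in this file.
def run_qsubs_locally():
  import glob
  qsubs_dir = _config.QSUBS_DIR + NAME + '/'
  for sh_fn in sorted(glob.glob(qsubs_dir + 'q_*.sh')):
    # Each script is a one-line '#!/bin/bash' wrapper around a python command
    print('Running %s...' % (sh_fn))
    subprocess.check_call(['bash', sh_fn])
  return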
def gen_qsubs():
  # Generate qsub shell scripts and commands for easy parallelization
  print('Generating qsub scripts...')
  qsubs_dir = _config.QSUBS_DIR + NAME + '/'
  util.ensure_dir_exists(qsubs_dir)
  qsub_commands = []

  # Generate qsubs only for unfinished jobs
  treat_control_df = pd.read_csv(_config.DATA_DIR + 'treatment_control_design.csv', index_col=0)

  num_scripts = 0
  for idx, row in treat_control_df.iterrows():
    treat_nm, control_nm = row['Treatment'], row['Control']

    # Skip completed jobs: a non-empty output pickle already exists
    if os.path.exists(out_dir + '%s.pkl' % (treat_nm)):
      if os.path.getsize(out_dir + '%s.pkl' % (treat_nm)) > 0:
        continue

    command = 'python %s.py %s %s' % (NAME, treat_nm, control_nm)
    script_id = NAME.split('_')[0]

    '''
      Empirically determined RAM needs:
        pickle > 37 mb: needs 4 gb ram
        pickle > 335 mb: needs 8 gb ram
      The thresholds below sit slightly under these points, presumably as a
      safety margin.
    '''
    print(treat_nm)
    mb_file_size = _data.check_file_size(treat_nm, 'h6_anyindel')
    ram_gb = 2
    if mb_file_size > 30:
      ram_gb = 4
    if mb_file_size > 300:
      ram_gb = 8
    if mb_file_size > 1000:
      ram_gb = 16

    '''
      Can be very slow: up to 8+ hours for some conditions. It could help to
      split the three steps into three scripts. Statistical tests should be
      performed globally (for accurate FDR thresholds), and luckily these
      are the fast parts of the pipeline.

      Subtracting control from treatment involves many dataframe
      manipulations and is the bottleneck step. Fortunately, it can be
      parallelized.
    '''
    # Write shell scripts
    sh_fn = qsubs_dir + 'q_%s_%s_%s.sh' % (script_id, treat_nm, control_nm)
    with open(sh_fn, 'w') as f:
      f.write('#!/bin/bash\n%s\n' % (command))
    num_scripts += 1

    # Write qsub commands
    qsub_commands.append('qsub -V -P regevlab -l h_rt=16:00:00,h_vmem=%sG -wd %s %s &' % (ram_gb, _config.SRC_DIR, sh_fn))

  # Save commands
  commands_fn = qsubs_dir + '_commands.sh'
  with open(commands_fn, 'w') as f:
    f.write('\n'.join(qsub_commands))
  subprocess.check_output('chmod +x %s' % (commands_fn), shell=True)
  print('Wrote %s shell scripts to %s' % (num_scripts, qsubs_dir))
  return
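# A minimal sketch of what a size probe like _data.check_file_size might do;
# the repo's actual implementation is not shown in this file. The helper name
# and the path convention (one pickle per treatment inside a per-step
# directory under base_dir) are assumptions for illustration only.
def check_file_size_mb(base_dir, step_nm, treat_nm):
  # Assumed layout: <base_dir>/<step_nm>/<treat_nm>.pkl
  fn = os.path.join(base_dir, step_nm, treat_nm + '.pkl')
  if not os.path.isfile(fn):
    # Let callers decide how to handle missing inputs, e.g. by falling
    # back to a default size as the first gen_qsubs() variant does
    raise FileNotFoundError(fn)
  return os.path.getsize(fn) / 1e6  # bytes -> megabytes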