Example #1
def gen_qsubs():
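    # NOTE: relies on module-level context from the surrounding script;
    # the imports (os, subprocess, pandas as pd) and the globals
    # (_config, _data, util, NAME, out_dir) are defined elsewhere in
    # the repository.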
    # Generate qsub shell scripts and commands for easy parallelization
    print('Generating qsub scripts...')
    qsubs_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(qsubs_dir)
    qsub_commands = []

    # Generate qsubs only for unfinished jobs
    treat_control_df = pd.read_csv(_config.DATA_DIR +
                                   'treatment_control_design.csv',
                                   index_col=0)

    num_scripts = 0
    for idx, row in treat_control_df.iterrows():
        treat_nm = row['Treatment']
        if 'Cas9' in treat_nm:
            continue
        lib_nm = _data.get_lib_nm(treat_nm)
        if lib_nm == 'LibA':
            num_targets = 2000
            num_targets_per_split = 200
        elif lib_nm == 'CtoGA':
            num_targets = 4000
            num_targets_per_split = 500
        else:
            num_targets = 12000
            num_targets_per_split = 2000

        for start_idx in range(0, num_targets, num_targets_per_split):
            end_idx = start_idx + num_targets_per_split - 1

            # Skip completed
            out_pkl_fn = out_dir + '%s_%s_%s.pkl' % (treat_nm, start_idx,
                                                     end_idx)
            if (os.path.isfile(out_pkl_fn)
                    and os.path.getsize(out_pkl_fn) > 0):
                continue

            command = 'python %s.py %s %s %s' % (NAME, treat_nm, start_idx,
                                                 end_idx)
            script_id = NAME.split('_')[0]

            # Size the qsub RAM request empirically from the input file
            # size; fall back to the smallest tier if the file is missing.
            try:
                mb_file_size = _data.check_file_size(treat_nm,
                                                     'ag5a4_profile_subset')
            except FileNotFoundError:
                mb_file_size = 0
            ram_gb = 2
            if mb_file_size > 140:
                ram_gb = 4
            if mb_file_size > 400:
                ram_gb = 8
            if mb_file_size > 1000:
                ram_gb = 16

            # Write shell scripts
            sh_fn = qsubs_dir + 'q_%s_%s_%s.sh' % (script_id, treat_nm,
                                                   start_idx)
            with open(sh_fn, 'w') as f:
                f.write('#!/bin/bash\n%s\n' % (command))
            num_scripts += 1

            # Write qsub commands
            qsub_commands.append(
                'qsub -V -P regevlab -l h_rt=4:00:00,h_vmem=%sG -wd %s %s &' %
                (ram_gb, _config.SRC_DIR, sh_fn))

    # Save commands
    commands_fn = qsubs_dir + '_commands.sh'
    with open(commands_fn, 'w') as f:
        f.write('\n'.join(qsub_commands))

    subprocess.check_output('chmod +x %s' % (commands_fn), shell=True)

    print('Wrote %s shell scripts to %s' % (num_scripts, qsubs_dir))
    return
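
This example sizes each job's qsub memory request from the size of an intermediate pickle (Example #2 below does the same with different thresholds). Below is a minimal standalone sketch of that mapping, using the tiers from Example #1; the helper name ram_gb_for_mb is hypothetical, and only the thresholds come from the code above.

def ram_gb_for_mb(mb_file_size):
    # Map an input file size in MB to a qsub h_vmem request in GB,
    # mirroring the 2 -> 4 -> 8 -> 16 GB tiers used in gen_qsubs.
    if mb_file_size > 1000:
        return 16
    if mb_file_size > 400:
        return 8
    if mb_file_size > 140:
        return 4
    return 2

# Example: a 500 MB pickle gets an 8 GB request.
assert ram_gb_for_mb(500) == 8

Running the generated _commands.sh then submits every script at once: each qsub line is backgrounded with &, so the whole batch is dispatched in one shot.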
Example #2
def gen_qsubs():
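  # NOTE: as in Example #1, the imports (os, subprocess, pandas as pd)
  # and the globals (_config, _data, util, NAME, out_dir) are assumed to
  # come from the surrounding module.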
  # Generate qsub shell scripts and commands for easy parallelization
  print('Generating qsub scripts...')
  qsubs_dir = _config.QSUBS_DIR + NAME + '/'
  util.ensure_dir_exists(qsubs_dir)
  qsub_commands = []

  # Generate qsubs only for unfinished jobs
  treat_control_df = pd.read_csv(_config.DATA_DIR + 'treatment_control_design.csv', index_col=0)

  num_scripts = 0
  for idx, row in treat_control_df.iterrows():
    treat_nm, control_nm = row['Treatment'], row['Control']

    out_pkl_fn = out_dir + '%s.pkl' % (treat_nm)
    if os.path.exists(out_pkl_fn) and os.path.getsize(out_pkl_fn) > 0:
      continue

    command = 'python %s.py %s %s' % (NAME, treat_nm, control_nm)
    script_id = NAME.split('_')[0]

    '''
      Empirically determined RAM requirements:
        pickle > 37 mb: needs 4 gb ram
        pickle > 335 mb: needs 8 gb ram
      The thresholds below (30 / 300 / 1000 mb) are set slightly
      conservatively relative to these measurements.
    '''
    print(treat_nm)  # log the current treatment for progress tracking
    mb_file_size = _data.check_file_size(treat_nm, 'h6_anyindel')
    ram_gb = 2
    if mb_file_size > 30:
      ram_gb = 4
    if mb_file_size > 300:
      ram_gb = 8
    if mb_file_size > 1000:
      ram_gb = 16

    '''
      Can be very slow: up to 8+ hours for some conditions.

      It could help to split the three steps into three scripts.
      Statistical tests should be performed globally (for accurate FDR
      thresholds), and these are luckily the fast parts of the pipeline.

      Subtracting control from treatment involves many dataframe
      manipulations and is the bottleneck step. Fortunately, it can be
      parallelized.
    '''

    # Write shell scripts
    sh_fn = qsubs_dir + 'q_%s_%s_%s.sh' % (script_id, treat_nm, control_nm)
    with open(sh_fn, 'w') as f:
      f.write('#!/bin/bash\n%s\n' % (command))
    num_scripts += 1

    # Write qsub commands
    qsub_commands.append('qsub -V -P regevlab -l h_rt=16:00:00,h_vmem=%sG -wd %s %s &' % (ram_gb, _config.SRC_DIR, sh_fn))

  # Save commands
  commands_fn = qsubs_dir + '_commands.sh'
  with open(commands_fn, 'w') as f:
    f.write('\n'.join(qsub_commands))

  subprocess.check_output('chmod +x %s' % (commands_fn), shell=True)

  print('Wrote %s shell scripts to %s' % (num_scripts, qsubs_dir))
  return
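
The comment block above notes that subtracting control from treatment is the bottleneck but can be parallelized across independent target ranges. Below is a minimal sketch of that idea, assuming the ranges are truly independent; the helper names subtract_control_chunk and parallel_subtract are hypothetical and not part of the repository.

from multiprocessing import Pool

import pandas as pd


def subtract_control_chunk(args):
  # Hypothetical worker: subtract control counts from treatment counts
  # for one independent slice of target sites.
  treat_df, control_df, start_idx, end_idx = args
  treat_chunk = treat_df.iloc[start_idx:end_idx]
  control_chunk = control_df.iloc[start_idx:end_idx]
  return treat_chunk.subtract(control_chunk, fill_value=0)


def parallel_subtract(treat_df, control_df, chunk_size=2000, workers=4):
  # Fan the bottleneck step out across worker processes; the chunks
  # are independent, so the results can be concatenated in order.
  args = [(treat_df, control_df, s, min(s + chunk_size, len(treat_df)))
          for s in range(0, len(treat_df), chunk_size)]
  with Pool(workers) as pool:
    results = pool.map(subtract_control_chunk, args)
  return pd.concat(results)

On platforms where multiprocessing spawns rather than forks (e.g. Windows, or macOS by default), the Pool call should be guarded by if __name__ == '__main__'.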