def gen_qsubs():
    """Emit one qsub shell script per (design row, major threshold) pair.

    Each script runs ``python <NAME>.py <short name> <threshold>``. The
    matching submission lines are collected into ``_commands.sh`` inside
    the qsub directory, which is then made executable.
    """
    print('Generating qsub scripts...')
    target_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(target_dir)

    submit_lines = []
    written = 0
    for _, design_row in design_df.iterrows():
        short_nm = design_row['Short name']
        for cutoff in params['major_thresholds']:
            prefix = NAME.split('_')[0]
            job = f'python {NAME}.py {short_nm} {cutoff}'

            # One bash wrapper per job
            script_path = target_dir + f'q_{prefix}_{short_nm}_{cutoff}.sh'
            with open(script_path, 'w') as script:
                script.write(f'#!/bin/bash\n{job}\n')
            written += 1

            # Matching submission line
            submit_lines.append(
                f'qsub -V -P regevlab -l h_rt=10:00:00,h_vmem=4G '
                f'-wd {_config.SRC_DIR} {script_path} &'
            )

    # Persist the submission lines as an executable script
    commands_path = target_dir + '_commands.sh'
    with open(commands_path, 'w') as out:
        out.write('\n'.join(submit_lines))
    subprocess.check_output(f'chmod +x {commands_path}', shell=True)

    print(f'Wrote {written} shell scripts to {target_dir}')
    return
def gen_qsubs():
    """Write qsub shell scripts and a submission-command list for the '_510' run.

    One script is written per (sample name, split) pair: six sample names
    ('190510Gif_D19-2120' followed by 26-28 and 35-37) times 15 splits.
    Each script invokes ``<NAME>.py <sample> <split>`` through the
    hard-coded interpreter path below. The matching qsub lines are saved
    to ``_commands.txt`` in the same directory.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('Generating qsub scripts...')
    qsubs_dir = _config.QSUBS_DIR + NAME + '_510' + '/'
    util.ensure_dir_exists(qsubs_dir)

    # range objects cannot be concatenated with '+' on Python 3, so
    # materialise them as lists first.
    sample_ids = list(range(26, 29)) + list(range(35, 38))
    sample_nms = ['190510Gif_D19-2120{0}'.format(i) for i in sample_ids]
    script_id = NAME.split('_')[0]

    qsub_commands = []
    num_scripts = 0
    for _nm in sample_nms:
        for _split in range(15):
            command = '/cluster/shz24/anaconda3/envs/splice_env/bin/python %s.py %s %s' % (
                NAME, _nm, _split)

            # Write shell scripts
            sh_fn = qsubs_dir + 'q_%s_%s_%s.sh' % (script_id, _nm, _split)
            with open(sh_fn, 'w') as f:
                f.write('#!/bin/bash\n%s\n' % (command))
            num_scripts += 1

            # Write qsub commands
            qsub_commands.append('qsub -m e -wd %s %s' % (_config.SRC_DIR, sh_fn))

    # Save commands
    with open(qsubs_dir + '_commands.txt', 'w') as f:
        f.write('\n'.join(qsub_commands))

    print('Wrote %s shell scripts to %s' % (num_scripts, qsubs_dir))
    return
def gen_qsubs():
    """Write one qsub shell script per experiment found in ``exp_pairs``.

    ``exp_pairs`` is iterated as a collection of collections; every inner
    element gets a script running ``python <NAME>.py <exp> redo``. The
    matching qsub lines are saved to ``_commands.txt`` in the qsub
    directory.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('Generating qsub scripts...')
    qsubs_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(qsubs_dir)

    script_id = NAME.split('_')[0]
    qsub_commands = []
    num_scripts = 0
    for exp_p in exp_pairs:
        for exp in exp_p:
            command = 'python %s.py %s redo' % (NAME, exp)

            # Write shell scripts
            sh_fn = qsubs_dir + 'q_%s_%s.sh' % (script_id, exp)
            with open(sh_fn, 'w') as f:
                f.write('#!/bin/bash\n%s\n' % (command))
            num_scripts += 1

            # Write qsub commands
            qsub_commands.append('qsub -m e -V -wd %s %s' % (_config.SRC_DIR, sh_fn))

    # Save commands
    with open(qsubs_dir + '_commands.txt', 'w') as f:
        f.write('\n'.join(qsub_commands))

    print('Wrote %s shell scripts to %s' % (num_scripts, qsubs_dir))
    return
def gen_qsubs():
    """Write one qsub shell script for each hard-coded experiment name.

    Each script runs ``python <NAME>.py <exp>``. The matching qsub lines
    are saved to ``_commands.txt`` in the qsub directory.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('Generating qsub scripts...')
    qsubs_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(qsubs_dir)

    experiments = [
        'VO_K562',
        'VO_HCT116',
        'VO_HEK293',
        'Lib1-mES',
        'Lib1-HCT116',
        'Lib1-HEK293T',
        'DisLib-U2OS',
        'DisLib-mES',
        'DisLib-HEK293T',
        'DisLib-U2OS-HEK-Mixture',
        'PRL-Lib1-mES',
        'PRL-DisLib-mES',
    ]
    script_id = NAME.split('_')[0]

    qsub_commands = []
    num_scripts = 0
    for exp in experiments:
        command = 'python %s.py %s' % (NAME, exp)

        # Write shell scripts
        sh_fn = qsubs_dir + 'q_%s_%s.sh' % (script_id, exp)
        with open(sh_fn, 'w') as f:
            f.write('#!/bin/bash\n%s\n' % (command))
        num_scripts += 1

        # Write qsub commands
        qsub_commands.append('qsub -m e -wd %s %s' % (_config.SRC_DIR, sh_fn))

    # Save commands
    with open(qsubs_dir + '_commands.txt', 'w') as f:
        f.write('\n'.join(qsub_commands))

    print('Wrote %s shell scripts to %s' % (num_scripts, qsubs_dir))
    return
def main(inp_dir, out_dir):
    """Print the script name, ensure ``out_dir`` exists, then run ``gather``.

    Args:
        inp_dir: Input directory, passed through to ``gather``.
        out_dir: Output directory; created if missing, then passed to
            ``gather``.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(NAME)
    util.ensure_dir_exists(out_dir)
    gather(inp_dir, out_dir)
    return
def demultiplex(split):
    """Quality-filter one FASTQ split and append reads to per-experiment FASTA files.

    Reads ``<inp_dir><split>.fq`` in 4-line records. A record whose mean
    quality (each quality character decoded as ``ord(c) - 33``) is below
    30 is rejected. Every other record is passed to ``match``, which
    returns the experiment id and trimmed read; the read is appended to
    ``<out_dir><id>/<split>.fa``.

    Args:
        split: Name of the split; used for both the input ``.fq`` file
            name and the output ``.fa`` file names.
    """
    inp_fn = inp_dir + '%s.fq' % (split)

    # Prepare one output location per experiment plus the 'other' bucket.
    # NOTE(review): util.exists_empty_fn presumably creates/truncates the
    # file so the appends below start from empty -- confirm in util.
    for name in list(exp_design['Name']) + ['other']:
        util.ensure_dir_exists(out_dir + name)
        util.exists_empty_fn(out_dir + name + '/%s.fa' % (split))

    lc = util.line_count(inp_fn)
    num_bad_q, num_tot = 0, 0
    timer = util.Timer(total=lc)
    # Distinct handle names: the original reused `f` for the output file,
    # shadowing the input file being iterated.
    with open(inp_fn) as fq:
        for i, line in enumerate(fq):
            if i % 4 == 0:
                header = line.strip()
            if i % 4 == 1:
                read = line.strip()
            if i % 4 == 3:
                num_tot += 1
                qs = line.strip()
                quals = [ord(s) - 33 for s in qs]
                if np.mean(quals) < 30:
                    num_bad_q += 1
                    continue

                demultiplex_id, trimmed_read = match(read, header)

                out_fn = out_dir + '%s/%s.fa' % (demultiplex_id, split)
                with open(out_fn, 'a') as fa:
                    fa.write('>' + header[1:] + '\n' + trimmed_read + '\n')

            timer.update()

    # Force float division (integer division on Python 2 always printed 0)
    # and avoid ZeroDivisionError when the input holds no complete record.
    frac_rejected = float(num_bad_q) / num_tot if num_tot else 0.0
    print('Rejected %s fraction of reads' % (frac_rejected))
    return
def gen_qsubs():
    """Emit one qsub shell script for every name in ``ill_nms``.

    Each script runs ``python <NAME>.py <nm>``. The submission lines are
    gathered into an executable ``_commands.sh`` in the qsub directory.
    """
    print('Generating qsub scripts...')
    target_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(target_dir)

    submit_lines = []
    written = 0
    for ill_nm in ill_nms:
        prefix = NAME.split('_')[0]
        job = f'python {NAME}.py {ill_nm}'

        # One bash wrapper per job
        script_path = target_dir + f'q_{prefix}_{ill_nm}.sh'
        with open(script_path, 'w') as script:
            script.write(f'#!/bin/bash\n{job}\n')
        written += 1

        # Matching submission line
        submit_lines.append(
            f'qsub -V -P regevlab -l h_rt=10:00:00 -wd {_config.SRC_DIR} {script_path} &'
        )

    # Persist the submission lines as an executable script
    commands_path = target_dir + '_commands.sh'
    with open(commands_path, 'w') as out:
        out.write('\n'.join(submit_lines))
    subprocess.check_output(f'chmod +x {commands_path}', shell=True)

    print(f'Wrote {written} shell scripts to {target_dir}')
    return
def gen_qsubs():
    """Write one qsub shell script per name in ``_data.D['Name']``.

    Each script runs ``python <NAME>.py <nm>`` and is named with the fixed
    script id 'pre-l2'. The matching qsub lines are saved to
    ``_commands.txt`` in the qsub directory.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('Generating qsub scripts...')
    qsubs_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(qsubs_dir)

    # The original also read _data.L3 into an unused local; that dead
    # assignment has been dropped.
    l2_names = _data.D['Name']
    script_id = 'pre-l2'

    qsub_commands = []
    num_scripts = 0
    for nm in l2_names:
        command = 'python %s.py %s' % (NAME, nm)

        # Write shell scripts
        sh_fn = qsubs_dir + 'q_%s_%s.sh' % (script_id, nm)
        with open(sh_fn, 'w') as f:
            f.write('#!/bin/bash\n%s\n' % (command))
        num_scripts += 1

        # Write qsub commands
        qsub_commands.append('qsub -m e -V -wd %s %s' % (_config.SRC_DIR, sh_fn))

    # Save commands
    with open(qsubs_dir + '_commands.txt', 'w') as f:
        f.write('\n'.join(qsub_commands))

    print('Wrote %s shell scripts to %s' % (num_scripts, qsubs_dir))
    return
def gen_qsubs():
    """Write one qsub shell script per condition in ``exp_design['Name']``.

    Each script runs ``python <NAME>.py <condition>``. The submission
    lines are written to ``_commands.sh`` in the qsub directory, which is
    then made executable.
    """
    print('Generating qsub scripts...')
    qsubs_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(qsubs_dir)

    script_id = NAME.split('_')[0]
    qsub_commands = []
    num_scripts = 0
    # The original filtered exp_design once per condition to fetch a
    # 'Library' value that was never used. That dead lookup made the loop
    # quadratic in the number of conditions and has been removed.
    for condition in exp_design['Name']:
        command = f'python {NAME}.py {condition}'

        # Write shell scripts
        sh_fn = qsubs_dir + f'q_{script_id}_{condition}.sh'
        with open(sh_fn, 'w') as f:
            f.write('#!/bin/bash\n%s\n' % (command))
        num_scripts += 1

        # Write qsub commands
        qsub_commands.append(
            'qsub -j y -P regevlab -V -l h_rt=4:00:00,h_vmem=2G -wd %s %s &'
            % (_config.SRC_DIR, sh_fn))

    # Save commands
    commands_fn = qsubs_dir + '_commands.sh'
    with open(commands_fn, 'w') as f:
        f.write('\n'.join(qsub_commands))
    subprocess.check_output('chmod +x %s' % (commands_fn), shell=True)

    print('Wrote %s shell scripts to %s' % (num_scripts, qsubs_dir))
    return
def main(inp_dir, out_dir, srr_id='', start='none', end='none'):
    """Dispatch between qsub generation, a single run, and a range of runs.

    Modes, chosen from the arguments:
      * no ``srr_id``, ``start`` or ``end``: generate qsub scripts.
      * ``srr_id`` only: run ``control_adjustment`` for that id unless
        ``is_control`` reports it as a control.
      * otherwise: treat ``start``/``end`` as an inclusive integer range
        of SRR numbers and process each non-control id.

    Args:
        inp_dir: Input directory passed to ``control_adjustment``.
        out_dir: Output directory; created if missing.
        srr_id: Single run id, or '' when unused.
        start: First SRR number (int-convertible), or 'none'.
        end: Last SRR number (int-convertible, inclusive), or 'none'.

    Returns:
        ``out_dir`` after a range run; ``None`` in the other two modes.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(NAME)
    util.ensure_dir_exists(out_dir)

    no_range = start == 'none' and end == 'none'

    if srr_id == '' and no_range:
        gen_qsubs()
        return

    if srr_id != '' and no_range:
        if is_control(srr_id):
            print('is control')
            return
        control_adjustment(inp_dir, out_dir, srr_id)
        return

    start, end = int(start), int(end)
    timer = util.Timer(total=end - start + 1)
    for idnum in range(start, end + 1):
        srr_id = 'SRR%s' % (idnum)
        ans = is_control(srr_id)
        # Identity check kept on purpose: only a literal False triggers
        # the adjustment, exactly as in the original.
        if ans is False:
            control_adjustment(inp_dir, out_dir, srr_id)
        timer.update()

    return out_dir
def gen_qsubs():
    """Emit one qsub shell script per PacBio RS II run in ``exp_design``.

    Rows whose 'Instrument' equals 'PacBio RS II' are selected. For each,
    a script running ``python <NAME>.py <Run> <Library Name>`` is written,
    and the submission lines are gathered into an executable
    ``_commands.sh``.
    """
    print('Generating qsub scripts...')
    target_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(target_dir)

    submit_lines = []
    is_pacbio = exp_design['Instrument'] == 'PacBio RS II'
    run_ids = exp_design[is_pacbio]['Run']
    library_nms = exp_design[is_pacbio]['Library Name']

    written = 0
    for run_id, library_nm in zip(run_ids, library_nms):
        prefix = NAME.split('_')[0]
        job = f'python {NAME}.py {run_id} {library_nm}'

        # One bash wrapper per run
        script_path = target_dir + f'q_{prefix}_{run_id}.sh'
        with open(script_path, 'w') as script:
            script.write(f'#!/bin/bash\n{job}\n')
        written += 1

        # Matching submission line
        submit_lines.append(
            f'qsub -V -P regevlab -l h_rt=4:00:00 -wd {_config.SRC_DIR} {script_path} &'
        )

    # Persist the submission lines as an executable script
    commands_path = target_dir + '_commands.sh'
    with open(commands_path, 'w') as out:
        out.write('\n'.join(submit_lines))
    subprocess.check_output(f'chmod +x {commands_path}', shell=True)

    print(f'Wrote {written} shell scripts to {target_dir}')
    return
def gen_qsubs():
    """Write one qsub shell script per (type, split) pair.

    Types and their split counts are fixed: 36 for 'exons' and 32 for
    'introns'. Each script runs ``python -u <NAME>.py <typ> <split>``.
    The matching qsub lines are saved to ``_commands.txt`` in the qsub
    directory.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('Generating nohup scripts...')
    qsubs_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(qsubs_dir)

    # The original computed the same prefix twice (script_id and
    # script_abbrev) and kept unused w_dir / num_scripts locals.
    script_abbrev = NAME.split('_')[0]
    nums = {'exons': 36, 'introns': 32}

    qsub_commands = []
    curr_num = 0
    for typ in nums:
        for split in range(nums[typ]):
            command = 'python -u %s.py %s %s' % (NAME, typ, split)

            # Write shell scripts
            sh_fn = qsubs_dir + 'q_%s_%s_%s.sh' % (script_abbrev, typ, split)
            with open(sh_fn, 'w') as f:
                f.write('#!/bin/bash\n%s\n' % (command))
            curr_num += 1

            # Write qsub commands
            qsub_commands.append('qsub -m e -wd %s %s' % (_config.SRC_DIR, sh_fn))

    # Save commands
    with open(qsubs_dir + '_commands.txt', 'w') as f:
        f.write('\n'.join(qsub_commands))

    print('Wrote %s shell scripts to %s' % (curr_num, qsubs_dir))
    return
def main(nm='', start='', end=''):
    """Generate qsub scripts, or remaster alignments for a range of experiments.

    With all arguments empty, only ``gen_qsubs`` is run. Otherwise, for
    each integer experiment index in ``[start, end]`` the per-split input
    files ``<inp_place><nm>/<split>/<index>.txt`` are fed to
    ``remaster_aligns`` and the accumulated data is written with
    ``save_alignments``.

    Args:
        nm: Dataset name; selects the input and output subdirectories.
        start: First experiment index (int-convertible).
        end: Last experiment index (int-convertible, inclusive).

    Side effects:
        Sets the module-level ``expected_cutsite``.
    """
    global expected_cutsite

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(NAME)
    print(nm)

    if nm == '' and start == '' and end == '':
        gen_qsubs()
        return

    start, end = int(start), int(end)
    out_dir = out_place + nm + '/'
    util.ensure_dir_exists(out_dir)

    print('Preparing alignment output directories...')
    prepare_align_outdirs(out_dir, start, end)
    print('Done')

    expected_cutsite = len('TCCGTGCTGTAACGAAAGGATGGGTGCGACGCGTCAT') + 27

    inp_dir = inp_place + nm + '/'
    timer = util.Timer(total=end - start + 1)
    for iter_exp in range(start, end + 1):
        data = defaultdict(list)
        for split in os.listdir(inp_dir):
            # 'aligns' is not a split directory; skip it.
            if split == 'aligns':
                continue
            inp_fn = inp_dir + '%s/%s.txt' % (split, iter_exp)
            remaster_aligns(inp_fn, data)
        save_alignments(data, out_dir, iter_exp)
        timer.update()

    return
def individualize(inp_dir, out_dir):
    """Split each combined CSV in ``inp_dir`` into one CSV per experiment.

    a_gather produces large dataframes of 2000 experiments concatenated
    together. Extracting dataframes for each individual experiment is
    slow, while reading individual CSVs is faster, so this function
    produces those individual CSVs.

    For every file in ``inp_dir`` matching ``*csv``, rows are grouped by
    their 'Experiment' value and written to
    ``<out_dir><input name>/<experiment>.csv``.

    Args:
        inp_dir: Directory holding the combined CSV files.
        out_dir: Directory under which per-input folders are created.
    """
    for inp_fn in os.listdir(inp_dir):
        if not fnmatch.fnmatch(inp_fn, '*csv'):
            continue

        inp_nm = inp_fn.replace('.csv', '')
        out_fold = out_dir + inp_nm + '/'
        util.ensure_dir_exists(out_fold)

        df = pd.read_csv(inp_dir + inp_fn)
        exps = set(df['Experiment'])

        # Single-argument print(...) behaves identically on Python 2 and 3.
        print(inp_nm)
        timer = util.Timer(total=len(exps))
        for exp in exps:
            out_fn = out_fold + '%s.csv' % (exp)
            d = df[df['Experiment'] == exp]
            d.to_csv(out_fn)
            timer.update()

    return
def main(nm='', start='', end=''):
    """Remaster alignments for the targets ``all_names[start:end + 1]``.

    For each selected target name, every split directory under
    ``<inp_place><nm>/`` (except 'aligns') contributes
    ``<split>/<target>.txt`` to ``remaster_aligns``, and the accumulated
    data is then written with ``save_alignments``.

    Args:
        nm: Dataset name; selects the input and output subdirectories.
        start: Index of the first target (int-convertible).
        end: Index of the last target (int-convertible, inclusive).

    Side effects:
        Sets the module-level ``expected_cutsite``.
    """
    global expected_cutsite

    print(NAME)
    print(nm)

    first = int(start)
    last = int(end)
    start, end = first, last

    dest_dir = out_place + nm + '/'
    util.ensure_dir_exists(dest_dir)

    print('Preparing alignment output directories...')
    selected = all_names[start:end + 1]
    prepare_align_outdirs(dest_dir, selected)
    print('Done')

    expected_cutsite = len('GATGGGTGCGACGCGTCAT') + 28

    src_dir = inp_place + nm + '/'
    progress = util.Timer(total=len(selected))
    for target in selected:
        collected = defaultdict(list)
        for entry in os.listdir(src_dir):
            # 'aligns' is not a split directory
            if entry != 'aligns':
                remaster_aligns(src_dir + '%s/%s.txt' % (entry, target), collected)
        save_alignments(collected, dest_dir, target)
        progress.update()

    return
def gen_qsubs():
    """Write ten qsub shell scripts, one for each index 0-9.

    Each script runs ``python <NAME>.py <idx>``. The matching qsub lines
    are saved to ``_commands.txt`` in the qsub directory.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('Generating qsub scripts...')
    qsubs_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(qsubs_dir)

    script_id = NAME.split('_')[0]
    qsub_commands = []
    num_scripts = 0
    for idx in range(0, 10):
        command = 'python %s.py %s' % (NAME, idx)

        # Write shell scripts
        sh_fn = qsubs_dir + 'q_%s_%s.sh' % (script_id, idx)
        with open(sh_fn, 'w') as f:
            f.write('#!/bin/bash\n%s\n' % (command))
        num_scripts += 1

        # Write qsub commands
        qsub_commands.append('qsub -m e -wd %s %s' % (_config.SRC_DIR, sh_fn))

    # Save commands
    with open(qsubs_dir + '_commands.txt', 'w') as f:
        f.write('\n'.join(qsub_commands))

    print('Wrote %s shell scripts to %s' % (num_scripts, qsubs_dir))
    return
def main(argv):
    """Collect final fitness and genotype-matrix CSVs for one model experiment.

    Reads the design table ``<DATA_DIR><modelexp_nm>.csv`` and, for every
    row, copies ``_final_fitness.csv`` and ``_final_genotype_matrix.csv``
    from that row's model directory into ``<out_dir><modelexp_nm>/``.
    A row counts as a failure if either copy command exits non-zero; when
    the first copy fails the second is not attempted.

    Args:
        argv: Sequence whose first element is the model-experiment name.
    """
    print(NAME)
    modelexp_nm = argv[0]
    print(modelexp_nm)

    exp_design = pd.read_csv(_config.DATA_DIR + f'{modelexp_nm}.csv')

    new_out_dir = out_dir + f'{modelexp_nm}/'
    util.ensure_dir_exists(new_out_dir)

    print('Collating experiments...')
    model_out_dir = _config.OUT_PLACE + f'_fitness_from_reads_pt_multi/{modelexp_nm}/'
    num_fails = 0
    timer = util.Timer(total=len(exp_design))
    for _, row in exp_design.iterrows():
        int_nm = row['Name']

        try:
            command = f'cp {model_out_dir}/model_{int_nm}/_final_fitness.csv {new_out_dir}/fitness_{int_nm}.csv'
            subprocess.check_output(command, shell=True)

            command = f'cp {model_out_dir}/model_{int_nm}/_final_genotype_matrix.csv {new_out_dir}/genotype_matrix_{int_nm}.csv'
            subprocess.check_output(command, shell=True)
        except subprocess.CalledProcessError:
            # Only a failed copy is an expected, countable failure. The
            # original bare `except:` also swallowed KeyboardInterrupt
            # and genuine programming errors.
            num_fails += 1

        timer.update()

    print(f'Collated {len(exp_design)} experiments with {num_fails} failures')
    return
def gen_qsubs():
    """Write one qsub shell script per block of 62 consecutive SRR numbers.

    Blocks start at 3696622 and advance by 62 up to and including
    3702820. Each script runs ``python <NAME>.py none <start> <end>`` with
    ``end = start + 61``. The matching qsub lines are saved to
    ``_commands.txt`` in the qsub directory.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('Generating qsub scripts...')
    qsubs_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(qsubs_dir)

    script_id = NAME.split('_')[0]
    qsub_commands = []
    num_scripts = 0
    for start in range(3696622, 3702820 + 1, 62):
        end = start + 61
        command = 'python %s.py none %s %s' % (NAME, start, end)

        # Write shell scripts
        sh_fn = qsubs_dir + 'q_%s_%s.sh' % (script_id, start)
        with open(sh_fn, 'w') as f:
            f.write('#!/bin/bash\n%s\n' % (command))
        num_scripts += 1

        # Write qsub commands
        qsub_commands.append('qsub -m e -wd %s %s' % (_config.SRC_DIR, sh_fn))

    # Save commands
    with open(qsubs_dir + '_commands.txt', 'w') as f:
        f.write('\n'.join(qsub_commands))

    print('Wrote %s shell scripts to %s' % (num_scripts, qsubs_dir))
    return
def main(inp_dir, out_dir, nm='none', start='none', end='none'):
    """Dispatch between qsub generation, a single run, and a range of runs.

    Modes, chosen from the arguments:
      * all of ``nm``, ``start``, ``end`` equal 'none': generate qsub
        scripts.
      * ``nm`` only: call ``set_master_expected_cutsite(nm)`` and, unless
        its first result is False, run ``genotype_data`` for ``nm``.
      * otherwise: treat ``start``/``end`` as an inclusive integer range
        of SRR numbers and do the same for each 'SRR<number>'.

    Args:
        inp_dir: Input directory passed to ``genotype_data``.
        out_dir: Output directory; created if missing.
        nm: Single run name, or 'none'.
        start: First SRR number (int-convertible), or 'none'.
        end: Last SRR number (int-convertible, inclusive), or 'none'.

    Returns:
        ``out_dir`` after a range run; ``None`` in the other two modes.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(NAME)
    util.ensure_dir_exists(out_dir)

    no_range = start == 'none' and end == 'none'

    if nm == 'none' and no_range:
        gen_qsubs()
        return

    if nm != 'none' and no_range:
        # Run single
        print(nm)
        res, context = set_master_expected_cutsite(nm)
        if res is False:
            return
        genotype_data(inp_dir, out_dir, nm, context)
        return

    # Run many
    start, end = int(start), int(end)
    timer = util.Timer(total=end - start + 1)
    for idnum in range(start, end + 1):
        srr_id = 'SRR%s' % (idnum)
        res, context = set_master_expected_cutsite(srr_id)
        # As in the original, a skipped id does not advance the timer.
        if res is False:
            continue
        genotype_data(inp_dir, out_dir, srr_id, context)
        timer.update()

    return out_dir
def gen_qsubs():
    """Emit qsub shell scripts for every non-Cas9 name, in blocks of 2000.

    Names from ``exp_design['Name']`` containing 'Cas9' are skipped. For
    each remaining name, six scripts are written covering start indices
    0, 2000, ..., 10000. Each runs
    ``python <NAME>.py <bc> <start> <start + 1999>``. The submission lines
    are gathered into an executable ``_commands.sh``.
    """
    print('Generating qsub scripts...')
    target_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(target_dir)

    submit_lines = []
    written = 0
    eligible = (name for name in exp_design['Name'] if 'Cas9' not in name)
    for bc_nm in eligible:
        for lo in range(0, 12000, 2000):
            hi = lo + 1999
            prefix = NAME.split('_')[0]
            job = 'python %s.py %s %s %s' % (NAME, bc_nm, lo, hi)

            # One bash wrapper per block
            script_path = target_dir + 'q_%s_%s_%s.sh' % (prefix, bc_nm, lo)
            with open(script_path, 'w') as script:
                script.write('#!/bin/bash\n%s\n' % (job))
            written += 1

            # Matching submission line
            submit_lines.append(
                'qsub -V -l h_rt=2:00:00,h_vmem=1G -wd %s %s &'
                % (_config.SRC_DIR, script_path)
            )

    # Persist the submission lines as an executable script
    commands_path = target_dir + '_commands.sh'
    with open(commands_path, 'w') as out:
        out.write('\n'.join(submit_lines))
    subprocess.check_output('chmod +x %s' % (commands_path), shell=True)

    print('Wrote %s shell scripts to %s' % (written, target_dir))
    return
def gen_qsubs():
    """Emit one qsub shell script per '.fq' file found in ``inp_dir``.

    A directory entry qualifies when its name contains '.fq'. Each script
    runs ``python <NAME>.py <file name>``. The submission lines are
    gathered into an executable ``_commands.sh``.
    """
    print('Generating qsub scripts...')
    target_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(target_dir)

    submit_lines = []
    written = 0

    fastq_names = []
    for entry in os.listdir(inp_dir):
        if '.fq' in entry:
            fastq_names.append(entry)

    for fastq_nm in fastq_names:
        prefix = NAME.split('_')[0]
        job = 'python %s.py %s' % (NAME, fastq_nm)

        # One bash wrapper per input file
        script_path = target_dir + 'q_%s_%s.sh' % (prefix, fastq_nm)
        with open(script_path, 'w') as script:
            script.write('#!/bin/bash\n%s\n' % (job))
        written += 1

        # Matching submission line
        submit_lines.append(
            'qsub -V -l h_rt=2:00:00,h_vmem=1G -wd %s %s &'
            % (_config.SRC_DIR, script_path)
        )

    # Persist the submission lines as an executable script
    commands_path = target_dir + '_commands.sh'
    with open(commands_path, 'w') as out:
        out.write('\n'.join(submit_lines))
    subprocess.check_output('chmod +x %s' % (commands_path), shell=True)

    print('Wrote %s shell scripts to %s' % (written, target_dir))
    return
def main(inp_dir, out_dir):
    """Print the script name, ensure ``out_dir`` exists, then run ``individualize``.

    Args:
        inp_dir: Input directory, passed through to ``individualize``.
        out_dir: Output directory; created if missing, then passed to
            ``individualize``.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(NAME)
    util.ensure_dir_exists(out_dir)
    individualize(inp_dir, out_dir)
    return
def gen_qsubs():
    """Emit one qsub shell script per eligible treatment in ``treat_control_df``.

    Rows whose 'Treatment' contains 'ABE' or 'Cas9' are skipped. Each
    remaining treatment gets a script running
    ``python <NAME>.py <treatment>``. The submission lines are gathered
    into an executable ``_commands.sh``.
    """
    print('Generating qsub scripts...')
    target_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(target_dir)

    submit_lines = []
    written = 0
    for _, pair_row in treat_control_df.iterrows():
        treatment = pair_row['Treatment']
        if not ('ABE' not in treatment and 'Cas9' not in treatment):
            continue

        prefix = NAME.split('_')[0]
        job = 'python %s.py %s' % (NAME, treatment)

        # One bash wrapper per treatment
        script_path = target_dir + 'q_%s_%s.sh' % (prefix, treatment)
        with open(script_path, 'w') as script:
            script.write('#!/bin/bash\n%s\n' % (job))
        written += 1

        # Matching submission line
        submit_lines.append(
            'qsub -V -P regevlab -l h_rt=4:00:00,h_vmem=1G -wd %s %s &'
            % (_config.SRC_DIR, script_path)
        )

    # Persist the submission lines as an executable script
    commands_path = target_dir + '_commands.sh'
    with open(commands_path, 'w') as out:
        out.write('\n'.join(submit_lines))
    subprocess.check_output('chmod +x %s' % (commands_path), shell=True)

    print('Wrote %s shell scripts to %s' % (written, target_dir))
    return
def gen_qsubs():
    """Write one qsub shell script per (post-Cas9 sample, split) pair.

    Two hard-coded sample names are combined with split values
    0, 100, ..., 1900. Each script runs ``python <NAME>.py <bc> <split>``.
    The matching qsub lines are saved to ``_commands.txt`` in the qsub
    directory.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('Generating qsub scripts...')
    qsubs_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(qsubs_dir)

    postcas9_nms = [
        '052218_U2OS_+_LibA_postCas9_rep1',
        '052218_U2OS_+_LibA_postCas9_rep2',
    ]
    script_id = NAME.split('_')[0]

    qsub_commands = []
    num_scripts = 0
    for bc in postcas9_nms:
        for split in range(0, 2000, 100):
            command = 'python %s.py %s %s' % (NAME, bc, split)

            # Write shell scripts
            sh_fn = qsubs_dir + 'q_%s_%s_%s.sh' % (script_id, bc, split)
            with open(sh_fn, 'w') as f:
                f.write('#!/bin/bash\n%s\n' % (command))
            num_scripts += 1

            # Write qsub commands
            qsub_commands.append('qsub -m e -V -wd %s %s' % (_config.SRC_DIR, sh_fn))

    # Save commands
    with open(qsubs_dir + '_commands.txt', 'w') as f:
        f.write('\n'.join(qsub_commands))

    print('Wrote %s shell scripts to %s' % (num_scripts, qsubs_dir))
    return
def prepare_align_outdirs(out_plc, nms):
    """Create an emptied output directory under ``out_plc`` for every name.

    For each entry in ``nms`` the directory ``<out_plc><name>/`` is
    created if missing. If it already holds anything, ``rm -rf`` is run
    on the shell glob ``<dir>*``.

    Args:
        out_plc: Parent output path; expected to end with a separator
            because names are appended by plain concatenation.
        nms: Sized collection of names (converted with ``str``).
    """
    util.ensure_dir_exists(out_plc)

    progress = util.Timer(total=len(nms))
    for entry in nms:
        entry_dir = ''.join([out_plc, str(entry), '/'])
        util.ensure_dir_exists(entry_dir)
        # Only shell out when there is something to delete
        if os.listdir(entry_dir):
            subprocess.check_output('rm -rf %s*' % (entry_dir), shell=True)
        progress.update()
    return
def prepare_align_outdirs(out_plc, start, end):
    """Create an emptied ``SRR<number>/`` directory for each number in a range.

    For every integer in ``[start, end]`` the directory
    ``<out_plc>SRR<number>/`` is created if missing. If it already holds
    anything, ``rm -rf`` is run on the shell glob ``<dir>*``.

    Args:
        out_plc: Parent output path; expected to end with a separator
            because names are appended by plain concatenation.
        start: First SRR number.
        end: Last SRR number (inclusive).
    """
    util.ensure_dir_exists(out_plc)

    progress = util.Timer(total=end - start + 1)
    for srr_num in range(start, end + 1):
        srr_dir = ''.join([out_plc, 'SRR', str(srr_num), '/'])
        util.ensure_dir_exists(srr_dir)
        # Only shell out when there is something to delete
        if os.listdir(srr_dir):
            subprocess.check_output('rm -rf %s*' % (srr_dir), shell=True)
        progress.update()
    return
def main(inp_dir, out_dir, run=True):
    """Entry-point stub: ensure ``out_dir`` exists and return it.

    Args:
        inp_dir: Input directory (unused by this stub).
        out_dir: Output directory; created if missing.
        run: When false, a 'skipped' notice is printed before returning.

    Returns:
        ``out_dir`` in every case.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(NAME)
    util.ensure_dir_exists(out_dir)
    if not run:
        print('\tskipped')
        return out_dir

    # Function calls

    return out_dir
def main(inp_dir, out_dir, run=True):
    """Entry-point stub: ensure ``out_dir`` exists and return it.

    Args:
        inp_dir: Input directory (unused by this stub).
        out_dir: Output directory; created if missing.
        run: When false, a 'skipped' notice is printed before returning.

    Returns:
        ``out_dir`` in every case.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(NAME)
    util.ensure_dir_exists(out_dir)
    if not run:
        print('\tskipped')
        return out_dir

    # Function calls

    return out_dir
def main(data_nm=''):
    """Pool insertion statistics across experiments and fit the models.

    For each hard-coded experiment, rate statistics (filtered to
    'Entropy' > 0.01) and 1-bp insertion statistics are loaded.
      * 'DisLib' experiments contribute only rows whose '_Experiment' is
        in 73-300 or 16-72, in that order. Rate rows in the 16-72 band
        are further restricted to 'Ins1bp Ratio' < 0.3 to remove
        outliers.
      * 'VO' / 'Lib1' experiments contribute all rows.
    The pooled rate statistics are featurized on 'Ins1bp/Del Ratio' and
    handed to ``generate_models`` together with the pooled bp statistics.

    Args:
        data_nm: Unused by this function.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(NAME)
    global out_dir
    util.ensure_dir_exists(out_dir)

    import fi2_ins_ratio
    import fk_1bpins

    exps = [
        'VO-spacers-HEK293-48h-controladj',
        'VO-spacers-K562-48h-controladj',
        'DisLib-mES-controladj',
        'DisLib-U2OS-controladj',
        'Lib1-mES-controladj',
    ]

    # Collect the pieces and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0 and copied the
    # accumulated frame on every call.
    rate_parts = []
    bp_parts = []
    num_rate_rows = 0
    for exp in exps:
        rate_stats = fi2_ins_ratio.load_statistics(exp)
        rate_stats = rate_stats[rate_stats['Entropy'] > 0.01]
        bp_stats = fk_1bpins.load_statistics(exp)

        if 'DisLib' in exp:
            crit = (rate_stats['_Experiment'] >= 73) & (rate_stats['_Experiment'] <= 300)
            rs = rate_stats[crit]
            rate_parts.append(rs)
            num_rate_rows += len(rs)

            crit = (rate_stats['_Experiment'] >= 16) & (rate_stats['_Experiment'] <= 72)
            rs = rate_stats[crit]
            rs = rs[rs['Ins1bp Ratio'] < 0.3]  # remove outliers
            rate_parts.append(rs)
            num_rate_rows += len(rs)

            crit = (bp_stats['_Experiment'] >= 73) & (bp_stats['_Experiment'] <= 300)
            bp_parts.append(bp_stats[crit])

            crit = (bp_stats['_Experiment'] >= 16) & (bp_stats['_Experiment'] <= 72)
            bp_parts.append(bp_stats[crit])

        elif 'VO' in exp or 'Lib1' in exp:
            rate_parts.append(rate_stats)
            num_rate_rows += len(rate_stats)
            bp_parts.append(bp_stats)

        # Running total of pooled rate rows, as the original printed
        # with `print exp, len(all_rate_stats)`.
        print('%s %s' % (exp, num_rate_rows))

    if rate_parts:
        all_rate_stats = pd.concat(rate_parts, ignore_index=True)
    else:
        all_rate_stats = pd.DataFrame()
    if bp_parts:
        all_bp_stats = pd.concat(bp_parts, ignore_index=True)
    else:
        all_bp_stats = pd.DataFrame()

    X, Y, Normalizer = featurize(all_rate_stats, 'Ins1bp/Del Ratio')
    generate_models(X, Y, all_bp_stats, Normalizer)

    return
def main(inp_dir, out_dir, srr_id=None):
    """Generate qsub scripts, or convert the alignment for one SRR id.

    Args:
        inp_dir: Input directory (unused by this function).
        out_dir: Output directory; created if missing.
        srr_id: Run id to convert with ``convert_alignment``. When
            ``None``, ``gen_qsubs`` is called instead.

    Returns:
        ``out_dir``.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(NAME)
    util.ensure_dir_exists(out_dir)

    # Function calls
    if srr_id is None:
        gen_qsubs()
    else:
        convert_alignment(srr_id, out_dir)

    return out_dir
def main(data_nm=''):
    """Ensure the module-level ``out_dir`` exists and run ``prepare_dataset_try3``.

    Args:
        data_nm: Unused by this function.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(NAME)
    global out_dir
    util.ensure_dir_exists(out_dir)

    # Only the third dataset-preparation variant is active. The calls to
    # prepare_dataset_try1/2/4 were disabled in the original.
    prepare_dataset_try3()

    return
def main(data_nm='', redo_flag=''):
    """Ensure the module-level ``out_dir`` exists and run ``prepare_statistics``.

    Args:
        data_nm: Unused by this function.
        redo_flag: When exactly 'redo', the module-level ``redo`` flag is
            set to True before ``prepare_statistics`` runs.
    """
    # Globals are declared up front rather than inside the `if` body.
    global out_dir
    global redo

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(NAME)
    util.ensure_dir_exists(out_dir)

    if redo_flag == 'redo':
        redo = True

    prepare_statistics()
    return
def main(inp_dir, out_dir, run=True):
    """Build the database from ``DEFAULT_INP_DIR`` into ``out_dir``.

    Args:
        inp_dir: Accepted but not used; ``make_db`` always receives the
            module-level ``DEFAULT_INP_DIR``.
        out_dir: Output directory; created if missing.
        run: When false, a 'skipped' notice is printed and ``make_db`` is
            not called.

    Returns:
        ``out_dir`` in every case.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(NAME)
    util.ensure_dir_exists(out_dir)
    if not run:
        print('\tskipped')
        return out_dir

    # Function calls
    inp_fn = DEFAULT_INP_DIR
    make_db(inp_fn, out_dir)

    return out_dir
import sys, os, fnmatch, datetime, subprocess, imp sys.path.append('/cluster/mshen/') import numpy as np from collections import defaultdict from mylib import util import pandas as pd import matplotlib matplotlib.use('Pdf') import matplotlib.pyplot as plt import seaborn as sns # Default params inp_dir = _config.DATA_DIR NAME = util.get_fn(__file__) out_dir = _config.OUT_PLACE + NAME + '/' util.ensure_dir_exists(out_dir) ## # Functions ## ## # qsub ## def gen_qsubs(): # Generate qsub shell scripts and commands for easy parallelization print 'Generating qsub scripts...' qsubs_dir = _config.QSUBS_DIR + NAME + '/' util.ensure_dir_exists(qsubs_dir) qsub_commands = []