def gen_qsubs():
    """Generate qsub shell scripts and commands for easy parallelization.

    Scans ``inp_dir`` for files whose names match
    ``<basename>_R<read>_001_<split>.fastq``.  For every read-1 file a shell
    script is written into ``_config.QSUBS_DIR + NAME + '/'`` that re-runs
    this module with ``<split> <basename>`` as arguments, and the matching
    ``qsub`` invocation is collected.  All invocations are saved, one per
    line, to ``_commands.txt`` in the same directory.

    Files in ``inp_dir`` that do not follow the naming pattern are skipped.
    """
    print('Generating qsub scripts...')
    qsubs_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(qsubs_dir)
    qsub_commands = []

    # Raw string so `\d` is a regex escape rather than an (invalid) string
    # escape; the dot before "fastq" is escaped so it only matches a literal
    # dot.  Compiled once instead of once per directory entry.
    fastq_pattern = re.compile(r"(.*)_R(\d)_001_(\d+)\.fastq")
    script_id = NAME.split('_')[0]

    num_scripts = 0
    for fn in os.listdir(inp_dir):
        hit = fastq_pattern.search(fn)
        if hit is None:
            # Stray entry (log file, hidden file, ...): calling .groups() on
            # the missing match would raise AttributeError and abort the run.
            continue
        basename, rnum, snum = hit.groups()
        if int(rnum) != 1:
            # One job per read pair: only the R1 file triggers a script.
            continue

        command = '/cluster/bh0085/anaconda27/envs/py3/bin/python %s.py %s %s' % (
            NAME, snum, basename)

        # Write shell scripts
        sh_fn = qsubs_dir + 'q_%s_%s_%s.sh' % (script_id, basename, snum)
        with open(sh_fn, 'w') as f:
            f.write('#!/bin/bash\n%s\n' % (command))
        num_scripts += 1

        # Write qsub commands
        qsub_commands.append('qsub -m e -wd %s %s' % (_config.SRC_DIR, sh_fn))

    # Save commands
    with open(qsubs_dir + '_commands.txt', 'w') as f:
        f.write('\n'.join(qsub_commands))

    print('Wrote %s shell scripts to %s' % (num_scripts, qsubs_dir))
    return
def gen_qsubs():
    """Write one qsub shell script per (split, barcode-substring offset) pair.

    Scripts are written to ``QSUBS_DIR + NAME + '/'``; each one runs
    ``./<NAME>.py <split> <bc_substr_idx>``.  The matching ``qsub`` command
    lines are saved, newline-separated, to ``_commands.txt`` in that same
    directory.
    """
    print('Generating qsub scripts...')
    target_dir = QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(target_dir)

    # Prefix of the module name (text before the first underscore) tags
    # every script file.
    prefix = NAME.split('_')[0]

    # Enumerate the job grid up front: splits outermost, offsets innermost.
    jobs = [
        (split, bc_substr_idx)
        for split in range(N_SPLITS)
        for bc_substr_idx in range(0, bc_substr_max, bc_substr_inc)
    ]

    submit_lines = []
    written = 0
    for split, bc_substr_idx in jobs:
        script_path = target_dir + 'q_%s_%s_%s.sh' % (prefix, bc_substr_idx, split)
        job_command = f'./{NAME}.py {split} {bc_substr_idx}'

        # Shell script that launches this single job.
        with open(script_path, 'w') as script:
            script.write(f'#!/bin/bash\n{job_command}\n')
        written += 1

        # Submission line for the script just written.
        submit_lines.append('qsub -m e -wd %s %s' % (_config.SRC_DIR, script_path))

    # Persist every submission line in one file.
    with open(target_dir + '_commands.txt', 'w') as listing:
        listing.write('\n'.join(submit_lines))

    print('Wrote %s shell scripts to %s' % (written, target_dir))
    return
def gen_qsubs():
    """Write one qsub shell script per row of ``SEQUENCING_INFO``.

    Each script runs this module with ``<Name> <start> <end>``, where
    ``start`` is 0 and ``end`` is ``len(LIBRARY_DF)``.  Scripts go to
    ``_config.QSUBS_DIR + NAME + '/'`` and the ``qsub`` command lines are
    saved, newline-separated, to ``_commands.txt`` in that directory.
    """
    print('Generating qsub scripts...')
    target_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(target_dir)

    submit_lines = []
    written = 0
    for _, row in SEQUENCING_INFO.iterrows():
        bc = row.Name
        # Single-pass loop: one job covers the whole library for this row.
        for start in range(0, 1):
            end = len(LIBRARY_DF)
            prefix = NAME.split('_')[0]
            job_command = (
                '/cluster/bh0085/anaconda27/envs/py3/bin/python {}.py {} {} {}'
                .format(NAME, bc, start, end))

            # Shell script that launches this single job.
            script_path = target_dir + 'q_{}_{}_{}.sh'.format(prefix, bc, start)
            with open(script_path, 'w') as script:
                script.write('#!/bin/bash\n{}\n'.format(job_command))
            written += 1

            # Submission line for the script just written.
            submit_lines.append(
                'qsub -m e -wd {} {}'.format(_config.SRC_DIR, script_path))

    # Persist every submission line in one file.
    with open(target_dir + '_commands.txt', 'w') as listing:
        listing.write('\n'.join(submit_lines))

    print('Wrote {} shell scripts to {}'.format(written, target_dir))
    return
def gen_qsubs():
    """Generate qsub shell scripts and commands for easy parallelization.

    The oligo library is cut into batches of ``n_oligos_perbatch`` consecutive
    indices.  For each batch a shell script running
    ``./<NAME>.py <start> <end>`` is written to ``QSUBS_DIR/NAME/``, with
    ``end = start + n_oligos_perbatch`` (the batches are back-to-back, i.e.
    ``[start, end)`` intervals).  The matching ``qsub`` command lines are
    saved, newline-separated, to ``_commands.txt`` in the same directory.
    """
    print('Generating qsub scripts...')
    qsubs_dir = os.path.join(QSUBS_DIR, NAME + '/')
    util.ensure_dir_exists(qsubs_dir)
    qsub_commands = []

    script_id = NAME.split('_')[0]
    n_oligos_perbatch = 10

    # range() excludes its stop value, so the stop must be one past the
    # largest oligo index.  Without the +1, a library whose largest index is
    # an exact multiple of the batch size (e.g. 10) never gets a batch that
    # starts at that index, and that oligo is silently left out.
    stop = max(oligo_lib.index) + 1

    num_scripts = 0
    for start in range(0, stop, n_oligos_perbatch):
        end = start + n_oligos_perbatch
        command = f'./{NAME}.py {start} {end}'

        # Write shell scripts
        sh_fn = qsubs_dir + f'q_{script_id}_{start}.sh'
        with open(sh_fn, 'w') as f:
            f.write(f'#!/bin/bash\n{command}\n')
        num_scripts += 1

        # Write qsub commands
        qsub_commands.append(f'qsub -m e -wd {_config.SRC_DIR} {sh_fn}')

    # Save commands
    with open(qsubs_dir + '_commands.txt', 'w') as f:
        f.write('\n'.join(qsub_commands))

    print(f'Wrote {num_scripts} shell scripts to {qsubs_dir}')
    return
def main(nm='', start='', end=''):
    """Entry point: generate qsub scripts, or process experiments of one sample.

    Called with no arguments, only ``gen_qsubs()`` is run.  Otherwise, for
    every experiment index in ``range(start, end)`` the per-split alignment
    files ``inp_place/<nm>/<split>/<index>.txt`` are merged through
    ``remaster_aligns`` and written with ``save_alignments`` under
    ``out_place/<nm>/``.

    Args:
        nm: Sample name; selects the sub-directory of ``inp_place`` and
            ``out_place``.
        start: First experiment index (anything ``int()`` accepts).
        end: Stop index (anything ``int()`` accepts); the processing loop
            does not include it.
    """
    # Declared up front so the module-level value set below is unambiguous.
    global expected_cutsite

    if nm == '' and start == '' and end == '':
        gen_qsubs()
        return

    start, end = int(start), int(end)
    out_dir = out_place + nm + '/'
    util.ensure_dir_exists(out_dir)

    print('Preparing alignment output directories...')
    prepare_align_outdirs(out_dir, start, end)
    print('Done')

    expected_cutsite = 30

    inp_dir = inp_place + nm + '/'
    # The loop below performs end - start iterations; the timer total used to
    # be end - start + 1, so reported progress could never reach completion.
    timer = util.Timer(total=end - start)
    for iter_exp in range(start, end):
        data = defaultdict(list)
        for split in os.listdir(inp_dir):
            # 'aligns' lives next to the split directories but is not one.
            if split == 'aligns':
                continue
            inp_fn = inp_dir + '%s/%s.txt' % (split, iter_exp)
            remaster_aligns(inp_fn, data)
        save_alignments(data, out_dir, iter_exp)
        timer.update()
    return
def prepare_align_outdirs(out_plc, start, end):
    """Create and empty one output directory per experiment index.

    For every index in ``start..end`` (inclusive) the directory
    ``out_plc + str(index) + '/'`` is created if needed and its existing
    contents are deleted.

    Args:
        out_plc: Parent output directory path, expected to end with ``/``
            because sub-paths are built by string concatenation.
        start: First experiment index.
        end: Last experiment index (inclusive).

    Raises:
        OSError: If an existing entry cannot be removed.
    """
    # Local import keeps this block self-contained.
    import shutil

    util.ensure_dir_exists(out_plc)
    timer = util.Timer(total=end - start + 1)
    for exp in range(start, end + 1):
        out_idx_dir = out_plc + str(exp) + '/'
        util.ensure_dir_exists(out_idx_dir)
        # Delete entries directly instead of shelling out to `rm -rf <dir>*`:
        # no shell parsing of the path (spaces, metacharacters) and no
        # argument-list limit on directories with very many files.
        for entry in os.listdir(out_idx_dir):
            if entry.startswith('.'):
                # The shell glob `*` never matched hidden entries; keep that.
                continue
            path = os.path.join(out_idx_dir, entry)
            if os.path.isdir(path) and not os.path.islink(path):
                shutil.rmtree(path)
            else:
                # Plain files, and symlinks (removed without following them).
                os.remove(path)
        timer.update()
    return
def main(inp_dir, out_dir, nm='', start='', end=''):
    """Dispatch between qsub-script generation and genotyping.

    Prints ``NAME`` and makes sure ``out_dir`` exists.  If ``nm``, ``start``
    and ``end`` are all empty strings, ``gen_qsubs()`` is run and ``None`` is
    returned.  Otherwise ``genotype_data`` is called on the sample directory
    ``<inp_dir><nm>/`` and ``out_dir`` is returned.
    """
    print(NAME)
    util.ensure_dir_exists(out_dir)

    # No job arguments at all means "just emit the cluster scripts".
    if all(arg == '' for arg in (nm, start, end)):
        gen_qsubs()
        return

    sample_dir = '%s%s/' % (inp_dir, nm)
    genotype_data(sample_dir, out_dir, nm, start, end)
    return out_dir
def gen_qsubs():
    """Generate qsub shell scripts and commands for easy parallelization.

    Scans ``inp_dir`` for files whose names match
    ``<basename>_R<read>_001_<split>.fastq``.  For every read-1 file a shell
    script is written into ``_config.QSUBS_DIR + NAME + '/'`` that re-runs
    this module with ``<split> <basename>`` as arguments.  The matching
    ``qsub`` invocation sends stdout/stderr to timestamped files in
    ``logs_dir``.  All invocations are saved, one per line, to
    ``_commands.txt`` next to the scripts.

    Files in ``inp_dir`` that do not follow the naming pattern are skipped.
    """
    print('Generating qsub scripts...')
    qsubs_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(qsubs_dir)
    qsub_commands = []

    # One timestamp per invocation so all log files of a run share a prefix.
    runtime = int(datetime.datetime.now().timestamp())

    # Raw string so `\d` is a regex escape rather than an (invalid) string
    # escape; the dot before "fastq" is escaped so it only matches a literal
    # dot.  Compiled once instead of once per directory entry.
    fastq_pattern = re.compile(r"(.*)_R(\d)_001_(\d+)\.fastq")
    script_id = NAME.split('_')[0]

    num_scripts = 0
    for fn in os.listdir(inp_dir):
        hit = fastq_pattern.search(fn)
        if hit is None:
            # Stray entry (log file, hidden file, ...): calling .groups() on
            # the missing match would raise AttributeError and abort the run.
            continue
        basename, rnum, snum = hit.groups()
        print(basename, rnum, snum)
        print(inp_dir)
        if int(rnum) != 1:
            # One job per read pair: only the R1 file triggers a script.
            continue

        command = '/cluster/bh0085/anaconda27/envs/py3/bin/python %s.py %s %s' % (
            NAME, snum, basename)

        # Write shell scripts
        sh_fn = qsubs_dir + 'q_%s_%s_%s.sh' % (script_id, basename, snum)
        with open(sh_fn, 'w') as f:
            f.write('#!/bin/bash\n%s\n' % (command))
        num_scripts += 1

        # Write qsub commands
        outfile = os.path.join(logs_dir, f"o_{runtime}_{basename}_{snum}.txt")
        errorfile = os.path.join(logs_dir, f"e_{runtime}_{basename}_{snum}.txt")
        qsub_commands.append(
            f'qsub -o {outfile} -e {errorfile} -wd {_config.SRC_DIR} {sh_fn}'
        )

    # Save commands
    with open(qsubs_dir + '_commands.txt', 'w') as f:
        f.write('\n'.join(qsub_commands))

    print('Wrote %s shell scripts to %s' % (num_scripts, qsubs_dir))
    return
def gen_qsubs():
    """Write one qsub shell script per experiment name in ``EXP_NAMES``.

    Each script runs this module with the experiment name as its only
    argument.  Scripts go to ``QSUBS_DIR + NAME + '/'``; the ``qsub`` command
    lines, which send stdout/stderr to timestamped files in ``logs_dir``, are
    saved newline-separated to ``_commands.txt`` in that same directory.
    """
    print('Generating qsub scripts...')
    target_dir = QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(target_dir)

    # Shared by all log file names produced in this invocation.
    stamp = int(datetime.datetime.now().timestamp())
    prefix = NAME.split('_')[0]

    submit_lines = []
    written = 0
    for exp in EXP_NAMES:
        job_command = (
            f'/cluster/bh0085/anaconda27/envs/py3/bin/python {NAME}.py {exp}')

        # Shell script that launches this single job.
        script_path = target_dir + 'q_%s_%s.sh' % (prefix, exp)
        with open(script_path, 'w') as script:
            script.write(f'#!/bin/bash\n{job_command}\n')
        written += 1

        # Submission line, with per-job stdout / stderr destinations.
        stdout_path = os.path.join(logs_dir, 'o_%s_%s.txt' % (stamp, exp))
        stderr_path = os.path.join(logs_dir, 'e_%s_%s.txt' % (stamp, exp))
        submit_lines.append(
            'qsub -o %s -e %s -wd %s %s'
            % (stdout_path, stderr_path, _config.SRC_DIR, script_path))

    # Persist every submission line in one file.
    with open(target_dir + '_commands.txt', 'w') as listing:
        listing.write('\n'.join(submit_lines))

    print('Wrote %s shell scripts to %s' % (written, target_dir))
    return
## import pandas as pd import numpy as np import itertools as it import os, re, sys from collections import defaultdict sys.path.append("/cluster/bh0085") from mybio import util from _config import DATA_DIR, OUT_PLACE, N_SPLITS, QSUBS_DIR, OLIGO_LIBRARY, POSITIVE_CONTROLS_FILE, EXP_NAMES import _config #IO DIRECTORY CONFIG NAME = util.get_fn(__file__) OUT_DIR = os.path.join(OUT_PLACE, NAME) util.ensure_dir_exists(OUT_DIR) ## # CUSTOM CODE ## #load oligo library from experimental design oligos_lib = OLIGO_LIBRARY oligos_lib["id"] = oligos_lib.index positive_controls = pd.read_csv(POSITIVE_CONTROLS_FILE) from _config import DATA_DIR, OUT_PLACE, N_SPLITS, QSUBS_DIR TX_INP_DIR = os.path.join(OUT_PLACE, "c0_bin_transcript_umis") OLIGO_INP_DIR = os.path.join(OUT_PLACE, "b1_demultiplex_oligos") #load oligo library from experimental design
import _config
import sys, os, fnmatch, datetime, subprocess, imp
import numpy as np
from collections import defaultdict
# Make the cluster-local `mybio` package importable.
sys.path.append('/cluster/bh0085/')
from mybio import util
import pandas as pd
import gzip

# Default params
inp_dirs = [_config.SHE2955_DIR, _config.SHE3447_DIR]
# NAME is derived from this file's path by util.get_fn; it names this
# step's output directory under _config.OUT_PLACE, which is created on import.
NAME = util.get_fn(__file__)
out_dir = _config.OUT_PLACE + NAME + '/'
util.ensure_dir_exists(out_dir)


##
# Functions
##
def split(inp_fn, out_nm):
    # Choose how many lines go into each of `num_splits` pieces of `inp_fn`.
    inp_fn_numlines = util.line_count(inp_fn)
    num_splits = 30
    # Truncated quotient first, then bumped by one if the pieces would not
    # cover every line.
    split_size = int(inp_fn_numlines / num_splits)
    if num_splits * split_size < inp_fn_numlines:
        split_size += 1
    # Round up to a multiple of 4 — presumably so a 4-line FASTQ record is
    # never cut across two pieces; TODO confirm.
    while split_size % 4 != 0:
        split_size += 1
    print('Using split size %s' % (split_size))
import _config
from scipy.stats import entropy
import scipy.stats as stats
import matplotlib.pyplot as plt
#%matplotlib inline
# Directory holding the aggregated report CSVs read below.
DEFAULT_INP_DIR = _config.OUT_PLACE + 'f1_agg_reports/'
print(DEFAULT_INP_DIR)
# Fall back to a placeholder file name when __file__ is not defined —
# presumably for interactive / notebook sessions; TODO confirm.
if not "__file__" in vars():
    __file__="f_test"
NAME = util.get_fn(__file__)
# Figures for this script go in their own sub-directory of FIGS_PLACE.
FIG_DIR = os.path.join(FIGS_PLACE, NAME)
util.ensure_dir_exists(FIG_DIR)
# Both tables are read with the first CSV column as index and then transposed.
agg_results = pd.read_csv(os.path.join(DEFAULT_INP_DIR,f"{PRJ_NAME}_aggregate_stats.csv"),index_col=0).T
lib_results = pd.read_csv(os.path.join(DEFAULT_INP_DIR,f"{PRJ_NAME}_library_stats.csv"),index_col=0).T
# One row of three panels, 12 x 4 inches overall.
f,subs = plt.subplots(1,3)
f.set_size_inches(12,4)
# Panel 1: bar chart of `frac_crispr_of_mapped` for every row of lib_results.
plt.sca(subs[0])
ax = plt.gca()
plt.bar(lib_results.index,lib_results.frac_crispr_of_mapped)
ax.set_title("CRISPR cutting efficiency\n(Fraction non-wildtype reads)")
ax.set_ylabel("cutting efficiency")
ax.set_xlabel("library ID")
# Put a tick only at every 10th row's index value.
ax.set_xticks(lib_results[::10].index)
def demultiplex(split, filename):
    """Sort the read pairs of one FASTQ split into per-experiment FASTA files.

    Read pairs whose mean quality is below 30 on either mate are dropped.
    Every remaining pair is assigned to the first experiment whose test
    string occurs in the R1 header line, or to ``'other'`` when none does,
    and appended to ``out_dir/<filename>/<experiment>/R1_<split>.fa`` and
    ``R2_<split>.fa``.  Summary counts are printed at the end, the last line
    being a JSON object wrapped in ``<json>...</json>``.

    Args:
        split: Split identifier; compared against the digits captured from
            ``<digits>.fastq`` in the input file names, so it must be a str.
        filename: Substring selecting this run's files in ``inp_dir``; also
            used as the output sub-directory name.
    """
    # Pick the experiment design by run: file names containing "AH3W5GBGX9"
    # use the 2955 tables, everything else the 3447 tables.
    if "AH3W5GBGX9" in filename:
        print()
        exp_design = exp_design_2955
        exp_test_strs = exp_test_strs_2955
    else:
        exp_design = exp_design_3447
        exp_test_strs = exp_test_strs_3447

    # Create the output directory and (via util.exists_empty_fn) the R1/R2
    # output files for every experiment name plus the 'other' bin.
    for name in list(exp_design["Name"]) + ['other']:
        util.ensure_dir_exists(os.path.join(out_dir, '%s' % (filename), name))
        util.exists_empty_fn(
            os.path.join(out_dir, '%s/%s/R1_%s.fa' % (filename, name, split)))
        util.exists_empty_fn(
            os.path.join(out_dir, '%s/%s/R2_%s.fa' % (filename, name, split)))
        print(os.path.join(out_dir, name, '%s' % (filename)))

    # Group the input files by the digits preceding ".fastq"; only the group
    # whose digits equal `split` is processed.  The key is the captured
    # string, so sorting is lexicographic, not numeric.
    for snum, sgroup in it.groupby(sorted(
            os.listdir(inp_dir),
            key=lambda x: re.compile("(\d+)\.fastq").search(x).groups()[0]),
                                   key=lambda x: re.compile("(\d+)\.fastq").
                                   search(x).groups()[0]):
        if snum != split:
            continue
        files = list(sgroup)
        fns = list([sf for sf in files if filename in sf])
        print(("LANE: {0}, FILES: {1}".format(snum, fns)))
        # Map read number (the integer after "R" in the file name) -> file.
        read_files = dict([[int(re.compile("R(\d+)").search(e).group(1)), e]
                           for e in fns])
        inp_fn1 = os.path.join(inp_dir, read_files[1])
        inp_fn2 = os.path.join(inp_dir, read_files[2])
        lc = util.line_count(inp_fn1)
        num_bad_q, num_tot, num_other, num_mapped = 0, 0, 0, 0
        timer = util.Timer(total=lc)
        # Line counter; incremented before use, so the first line is 0.
        i = -1

        ##
        # Functions
        ##
        def match(r1, r2, h1, h2):
            # Return (experiment key, r1) for the first entry of
            # exp_test_strs whose test string occurs in the R1 header `h1`,
            # or ("other", r1) when none does.  `r2` and `h2` are not used,
            # and r1 is returned unchanged.
            for k, v in list(exp_test_strs.items()):
                try:
                    idx = h1.index(v)
                    return k, r1
                except ValueError as e:
                    continue
            return "other", r1

        with open(inp_fn1) as f1:
            with open(inp_fn2) as f2:
                print(inp_fn1)
                print(inp_fn2)
                # Walk both mate files in lockstep, one line of each per
                # iteration; lines are handled in groups of four.
                while 1:
                    i += 1
                    if i % 10000 == 0:
                        print((
                            "{0} records, ({1}%) [{2} bad] [{3} other]".format(
                                i / 4, 100 * float(i) / lc, num_bad_q,
                                num_other)))
                    try:
                        line1 = next(f1)
                        line2 = next(f2)
                    except StopIteration as e:
                        # Either file running out ends the loop.
                        break
                    # Line 0 of each group of four: header.
                    if i % 4 == 0:
                        h1 = line1.strip()
                        h2 = line2.strip()
                    # Line 1: sequence.
                    if i % 4 == 1:
                        r1 = line1.strip()
                        r2 = line2.strip()
                    # Line 3: quality string; the record is complete here.
                    if i % 4 == 3:
                        num_tot += 1
                        qs1 = line1.strip()
                        qs2 = line2.strip()
                        markbad = False
                        # Qualities are decoded as ord(char) - 33; a mean
                        # below 30 on either mate rejects the whole pair.
                        for qs in [qs1, qs2]:
                            quals = [ord(s) - 33 for s in qs]
                            if np.mean(quals) < 30:
                                markbad = True
                        if markbad:
                            num_bad_q += 1
                            continue
                        demultiplex_id, trimmed_read = match(r1, r2, h1, h2)
                        if demultiplex_id == 'other':
                            num_other += 1
                        out1_fn = out_dir + '%s/%s/R1_%s.fa' % (
                            filename, demultiplex_id, split)
                        # Sanity check: a record must be exactly two lines
                        # (header + sequence); an embedded line break in
                        # either part aborts the run.
                        if len(('>' + h1[1:] + '\n' + r1 +
                                '\n').splitlines()) > 2:
                            print('>' + h1[1:] + '\n' + r1 + '\n')
                            raise Exception()
                        #print('>' + h1[1:] + '\n' + r1 + '\n')
                        # The first header character is replaced by '>' to
                        # form the FASTA header.  Files are reopened in
                        # append mode for every record.
                        with open(out1_fn, 'a') as f:
                            f.write('>' + h1[1:] + '\n' + r1 + '\n')
                        out2_fn = out_dir + '%s/%s/R2_%s.fa' % (
                            filename, demultiplex_id, split)
                        with open(out2_fn, 'a') as f:
                            f.write('>' + h2[1:] + '\n' + r2 + '\n')
                        # Counts every written pair, including 'other' ones.
                        num_mapped += 1
                    #timer.update()
        #logs = pd.Series({"num_bad_q":num_bad_q,
        #                  "num_tot":num_tot})
        #logs.to_csv(os.path.join(LOGS_DIR,f"{datetime.date.today().isoformat}_{filename}_{split}.csv"))
        # NOTE(review): divides by num_tot, which is 0 when the input holds
        # no complete record — confirm empty splits cannot occur.
        print(('Rejected %s fraction of reads' % (num_bad_q / num_tot)))
        # Machine-readable summary; the <json> tags presumably let a log
        # parser pick this line out — TODO confirm.
        print("<json>" + json.dumps({
            "num_bad_q": num_bad_q,
            "num_tot": num_tot,
            "num_other": num_other,
            "num_mapped": num_mapped,
        }) + "</json>")
    return
import _config
import sys, os, fnmatch, datetime, subprocess
# Make the cluster-local `mybio` package importable.
sys.path.append('/cluster/bh0085/')
import numpy as np
from collections import defaultdict
from mybio import util
import pickle
from _config import LIBRARY_DF, SEQUENCING_INFO

# Default params
# Input comes from the 'b_alignment_rev/' directory under _config.OUT_PLACE;
# output goes to a directory named after this file, created on import.
inp_place = _config.OUT_PLACE + 'b_alignment_rev/'
NAME = util.get_fn(__file__)
out_place = _config.OUT_PLACE + NAME + '/'
util.ensure_dir_exists(out_place)
# NOTE(review): a `global` statement at module level has no effect; the name
# is only bound once something assigns it.
global expected_cutsite


##
# Left or right
##
def left_or_right_event(seq, cutsite):
    # Given a sequence with exactly one - event and possible end gaps, determine if it's on the left or right of the cutsite.
    # Each test counts the runs of non-gap characters in a slice: gaps are
    # turned into spaces and the slice is split on whitespace.  Two runs in a
    # slice mean the gap event lies inside it.  The slices overlap around
    # `cutsite` (left: up to and including index cutsite; right: from index
    # cutsite - 1 on).
    left_test = len(seq[:cutsite + 1].replace('-', ' ').split())
    right_test = len(seq[cutsite - 1:].replace('-', ' ').split())
    if left_test == 2 and right_test == 1:
        return 'left'
    if left_test == 1 and right_test == 2:
        return 'right'
def matchmaker(nm, split):
    """Align quality-filtered R2 reads of one FASTQ split to designed targets.

    Reads ``inp_dir/<nm>_R2_<split>.fastq`` in 4-line records.  Records whose
    mean quality (``ord(char) - 33``) is below 28 are dropped.  Every other
    read is reverse-complemented, candidate targets are looked up with
    ``find_best_designed_target``, the read is aligned with ``alignment``,
    and the result is stored in a buffer that is flushed to
    ``out_root_dir/<nm>/<split>/`` about once per percent of the input and
    once more at the end.  Progress lines are appended to
    ``_config.LOGS_DIR/nh_c_<nm>_<split>.out`` at every in-loop flush.

    Args:
        nm: Sample name; part of the input file name and the output path.
        split: Split identifier (str); part of the input file name and the
            output path.
    """
    print(nm, split)
    stdout_fn = os.path.join(_config.LOGS_DIR, 'nh_c_%s_%s.out' % (nm, split))
    util.exists_empty_fn(stdout_fn)
    out_dir = os.path.join(out_root_dir, nm, split)
    util.ensure_dir_exists(out_dir)

    inp_fn = inp_dir + '%s_R2_%s.fastq' % (nm, split)

    lsh_dict = build_targets_better_lsh()
    alignment_buffer = init_alignment_buffer()

    prepare_outfns(out_dir)

    qf = 0  # number of records rejected by the quality filter
    print(inp_fn)
    tot_reads = util.line_count(inp_fn)
    # Progress ticker is created as before; its update() call is disabled.
    timer = util.Timer(total=tot_reads)

    # Flush roughly once per percent of the input.  For inputs shorter than
    # 100 lines int(tot_reads / 100) is 0, which made `i % 0` below raise
    # ZeroDivisionError; clamp to 1 (with a step of 1 the in-loop flush never
    # fires and the final flush writes everything).
    flush_every = max(1, int(tot_reads / 100))

    with open(inp_fn) as f:
        for i, line in enumerate(f):
            if i % 4 == 1:
                # Sequence line of the record.
                l2 = line.strip()
            if i % 4 == 3:
                # Quality filter
                q2 = line.strip()
                qs = [ord(s) - 33 for s in q2]
                if np.mean(qs) < 28:
                    qf += 1
                    continue

                l2 = reverse_complement(l2)
                align_header = '>1'

                # Try to find designed target from LSH
                cand_idxs = find_best_designed_target(l2, lsh_dict)
                if len(cand_idxs) == 0:
                    continue

                # Run alignment
                best_idx, align = alignment(l2, cand_idxs)
                align = align.decode("utf-8")

                # Store alignment into buffer
                store_alignment(alignment_buffer, best_idx, align_header,
                                align)

            if i % flush_every == 1 and i > 1:
                # Flush alignment buffer
                alignment_buffer = flush_alignments(alignment_buffer, out_dir)

                # Stats for the curious
                with open(stdout_fn, 'a') as outf:
                    outf.write('Time: %s\n' % (datetime.datetime.now()))
                    outf.write('Progress: %s\n' % (i / flush_every))
                    outf.write('Quality filtered pct: %s\n' % (qf / (i / 4)))

    # Final flush
    alignment_buffer = flush_alignments(alignment_buffer, out_dir)

    return