def demultiplex(split):
    """Quality-filter one FASTQ split and append passing reads to FASTA files.

    Reads ``inp_dir + '<split>.fq'`` as four-line records (header, sequence,
    separator, quality). A record is rejected when the mean of
    ``ord(char) - 33`` over its quality line is below 30. Every other record
    is passed to ``match(read, header)``, which returns
    ``(demultiplex_id, trimmed_read)``; the trimmed read is appended to
    ``out_dir + '<demultiplex_id>/<split>.fa'`` under the original header
    (leading character replaced by ``>``).

    Args:
        split: Identifier of the split; interpolated into the input and
            output file names.

    Returns:
        None. Results are written to disk and the rejected fraction is
        printed.
    """
    inp_fn = inp_dir + '%s.fq' % (split)

    # Start every destination (each designed sample plus the catch-all
    # 'other' bin) from an existing, empty file, because reads are appended.
    for name in list(exp_design['Name']) + ['other']:
        util.ensure_dir_exists(out_dir + name)
        util.exists_empty_fn(out_dir + name + '/%s.fa' % (split))

    lc = util.line_count(inp_fn)
    num_bad_q, num_tot = 0, 0
    timer = util.Timer(total=lc)
    with open(inp_fn) as inp_f:
        for i, line in enumerate(inp_f):
            phase = i % 4
            if phase == 0:
                header = line.strip()
            elif phase == 1:
                read = line.strip()
            elif phase == 3:
                num_tot += 1
                quals = [ord(s) - 33 for s in line.strip()]
                if np.mean(quals) < 30:
                    num_bad_q += 1
                else:
                    demultiplex_id, trimmed_read = match(read, header)
                    out_fn = out_dir + '%s/%s.fa' % (demultiplex_id, split)
                    # Distinct handle name: the input handle must not be
                    # shadowed while it is still being iterated.
                    with open(out_fn, 'a') as out_f:
                        out_f.write(
                            '>' + header[1:] + '\n' + trimmed_read + '\n')
            # Runs for every line, including rejected records, so the
            # timer's count matches the line total it was created with.
            timer.update()

    # Explicit float division (Python 2 would truncate to 0) and a guard
    # for an input that contains no complete record.
    frac_rejected = float(num_bad_q) / num_tot if num_tot else 0.0
    print('Rejected %s fraction of reads' % (frac_rejected))
    return
def prepare_outfns(out_dir, umi_len=6, alphabet="ATGC"):
    """Create/empty one ``<prefix>.txt`` output file per possible UMI prefix.

    Every string of length ``umi_len`` over ``alphabet`` is generated (with
    the defaults: all 4**6 six-letter strings over ``ATGC``) and
    ``util.exists_empty_fn`` is called on ``out_dir + '<prefix>.txt'``.

    Args:
        out_dir: Output directory prefix; concatenated directly with the
            file name, so it is expected to end with a path separator.
        umi_len: Length of the prefix used to name the files.
        alphabet: Letters the prefixes are drawn from.

    Returns:
        None.
    """
    import itertools

    # product() varies the last position fastest, which is the same order
    # the original six nested loops produced.
    for letters in itertools.product(alphabet, repeat=umi_len):
        umi_short = ''.join(letters)
        out_fn = out_dir + '%s.txt' % (umi_short)
        util.exists_empty_fn(out_fn)
    return
def combine_outputs(out_dir):
    """Concatenate per-split output files into the main output directory.

    Looks in ``out_dir + 'split<k>/'`` for ``k`` in
    ``range(_parallel_config.SPLITS)``, collects every file name whose full
    path matches ``_parallel_config.REGEX_FILTER`` (interpreted by
    ``fnmatch``, i.e. as a shell-style wildcard pattern), and for each such
    name writes ``out_dir + name`` as the concatenation, in split order, of
    the copies found in the split directories.

    Args:
        out_dir: Main output directory prefix; concatenated directly with
            sub-paths, so it is expected to end with a path separator.

    Returns:
        None.
    """
    import shutil

    out_splits = [
        out_dir + 'split' + str(s) + '/'
        for s in range(_parallel_config.SPLITS)
    ]

    fns = set()
    for s in out_splits:
        for fn in os.listdir(s):
            if fnmatch.fnmatch(s + fn, _parallel_config.REGEX_FILTER):
                fns.add(fn)

    # Sorted only to make the processing/log order reproducible.
    for fn in sorted(fns):
        util.exists_empty_fn(out_dir + fn)
        print('\tCombining %s ...' % fn)
        # A name need not exist in every split; skip the missing ones.
        locs = [s + fn for s in out_splits if os.path.isfile(s + fn)]
        # Concatenate in-process instead of building a `cat ... > ...`
        # shell string: no quoting problems with unusual paths and no
        # dependency on an external `cat`.
        with open(out_dir + fn, 'wb') as combined:
            for loc in locs:
                with open(loc, 'rb') as part:
                    shutil.copyfileobj(part, combined)
    return
def matchmaker(nm, split):
    """Match read-1 library sequences to designed targets, keyed by read-2 UMI.

    Reads ``inp_dir + '<nm>_1_sequence_<split>.fastq'`` and the matching
    ``_2_`` file in lockstep as four-line FASTQ records. For each record:

    1. Reject it if either quality line has a mean ``ord(char) - 33`` score
       below 28.
    2. Locate the read-1 and read-2 constant regions; reject the record if
       either is absent.
    3. Take the library sequence following the read-1 constant and the UMI
       following the read-2 constant; skip the record if either contains
       ``N``.
    4. Ask ``find_best_designed_target`` for candidate designs, locally
       align the first candidate's target (plus a 20 nt anchor) against the
       read, and append a text record to ``umis_alignments_buffer[umi]``.

    The buffer is handed to ``flush_tuples`` periodically and once more
    after the last record. Progress statistics are appended to
    ``_config.SRC_DIR + 'b3_status_<nm>_<split>.out'``.

    Args:
        nm: Sample name; used in input, output and status file names.
        split: Split identifier (a string; it is concatenated into paths).

    Returns:
        None.
    """
    from Bio import pairwise2
    from Bio.pairwise2 import format_alignment

    # Expected amplicon layouts, kept for reference only (not used below):
    # READ1: NNNtaccagctgccctcgTCGaCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGNNNNNNNNNNNNNNNNNNNNNNNNtgattacacatatagacacgcGAGCAGCCATCTTTTATAGAATGGGtagaacccgtcctaaggactcagattgagcatcgtttgcttctcgagtactacctgg
    # READ2: NNNaaccgctgtgttctgcACGCGTNNNNNNNNNNNNNNNNNNACCGGTgcaggtaatgggccttactatcagtctcagtccttgtacagctcgtccatgccgagagtgatcccggcggcggtcacgaactccagcaggaccatgtgatcgcgcttctcgttggggtctttgctca

    def quality(line):
        """Return the mean of ``ord(char) - 33`` over a quality line."""
        return np.mean([ord(s) - 33 for s in line.strip()])

    # Loop-invariant layout constants, hoisted out of the per-read loop.
    r1_library_constant = "TACCAGCTGCCCTCGTCGAC"
    r1_library_start = len(r1_library_constant)
    r1_library_format = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGNNNNNNNNNNNNNNNNNNNNNNNN"
    r2_library_constant = "ggggtgttctgctggtagtggtc".upper()
    r2_library_start = len(r2_library_constant)
    r2_umi_format = "NNNNNNNNNNNNNNN"
    # Extra 20 bases appended to the target to anchor the alignment for
    # long read-1 deletions.
    alignment_anchor = "tgattacacatatagacacg".upper()

    qc_rejection_count = 0
    # Reported in the status file but never incremented: the read-1
    # alignment-score check it belonged to is disabled.
    read1_rejection_count = 0
    constant_region_rejection_count = 0
    accepted_count = 0
    nolib_rejection_count = 0

    print('%s %s' % (nm, split))
    stdout_fn = _config.SRC_DIR + 'b3_status_%s_%s.out' % (nm, split)
    util.exists_empty_fn(stdout_fn)
    out_dir = out_place + nm + '/' + split + '/'
    util.ensure_dir_exists(out_dir)
    inp_fn1 = inp_dir + '%s_1_sequence_%s.fastq' % (nm, split)
    inp_fn2 = inp_dir + '%s_2_sequence_%s.fastq' % (nm, split)
    lsh_dict = build_targets_better_lsh()
    umis_alignments_buffer = init_umis_alignments_buffer()
    prepare_outfns(out_dir)

    print(inp_fn1)
    tot_reads = util.line_count(inp_fn1)
    timer = util.Timer(total=tot_reads)
    # 0 for inputs shorter than 1000 lines; periodic flushing is then
    # skipped (instead of failing on modulo-by-zero) and the final flush
    # writes everything.
    flush_every = int(tot_reads / 1000)
    lines_per_pct = max(1, int(tot_reads / 100))

    i = -1
    print("OPENING FILES")
    with open(inp_fn1) as f1:
        with open(inp_fn2) as f2:
            while True:
                i += 1
                try:
                    r2_l = next(f2)
                    r1_l = next(f1)
                except StopIteration:
                    break

                # Housekeeping runs before the record is processed so the
                # `continue` statements below cannot skip it.
                timer.update()
                if flush_every > 0 and i > 1 and i % flush_every == 0:
                    print(i)
                    print("FLUSHING!")
                    # Flush alignment buffer
                    flush_tuples(umis_alignments_buffer, out_dir)
                    print(len(umis_alignments_buffer.keys()))
                    # Stats for the curious
                    with open(stdout_fn, 'a') as outf:
                        outf.write('Time: %s\n' % (datetime.datetime.now()))
                        outf.write('Progress: %s\n' % (i / lines_per_pct))
                        outf.write('Quality filtered pct: %s\n' % (
                            float(qc_rejection_count) / max(1, i // 4)))
                        outf.write(
                            "accepted {0}, rejected {1} bad read1, {2} bad lib\n"
                            .format(accepted_count, read1_rejection_count,
                                    nolib_rejection_count))

                if i % 4 == 1:
                    # Raw sequence lines (not stripped), as before.
                    read1 = r1_l
                    read2 = r2_l

                if i % 4 == 3:
                    if quality(r2_l) < 28 or quality(r1_l) < 28:
                        qc_rejection_count += 1
                        continue

                    # str.find returns -1 when the constant is absent.
                    a1_offset = read1.upper().find(r1_library_constant)
                    a2_offset = read2.upper().find(r2_library_constant)
                    if a1_offset < 0 or a2_offset < 0:
                        constant_region_rejection_count += 1
                        continue

                    read1_extended_content = read1[a1_offset + r1_library_start:]
                    lib = read1_extended_content[:len(r1_library_format)]
                    umi = read2[a2_offset + r2_library_start:][:len(r2_umi_format)]

                    if "N" in umi or "N" in lib:
                        continue

                    # No longer check for perfect matches. Just align.
                    cand_idxs = find_best_designed_target(lib, lsh_dict)
                    if len(cand_idxs) == 0:
                        print("rejecting for no good match")
                        nolib_rejection_count += 1
                        continue
                    best_idx = cand_idxs[0]

                    target_alignment_region = (
                        names_targets[best_idx] + alignment_anchor)
                    align = pairwise2.align.localms(
                        target_alignment_region, read1_extended_content,
                        2, -1, -5, -.1)[0]

                    output_complete = """>1\n{0}\n{1}\n{2}\n{3}\n""".format(
                        umi, best_idx, align[2],
                        "\n".join(format_alignment(*align).splitlines()[:3]))
                    umis_alignments_buffer[umi].append(output_complete)
                    accepted_count += 1

    # Final flush so records buffered after the last periodic flush are
    # written too.
    # NOTE(review): assumes flush_tuples may safely be called again on a
    # buffer it has already flushed, as the periodic flushes rely on -
    # confirm against its implementation.
    flush_tuples(umis_alignments_buffer, out_dir)
    return
def matchmaker(nm, split):
    """Pair the gRNA found in read 1 with the UMI found in read 2.

    Reads ``inp_dir + '<nm>_1_sequence_<split>.fastq'`` and the matching
    ``_2_`` file in lockstep as four-line FASTQ records. For each record:

    1. Reject it if either quality line has a mean ``ord(char) - 33`` score
       below 28.
    2. Locate the read-1 prefix constant and the read-2 prefix constant;
       reject the record if either is absent.
    3. Take the 20 (and 19) bases after the read-1 prefix as the gRNA and
       the 15 bases after the read-2 prefix as the UMI.
    4. Look the gRNA up in the ``exp_design`` column
       ``"Designed gRNA (NGG orientation, 19 and 20)"`` (20-mer first, then
       19-mer) and, on a hit, append a text record carrying the design's
       ``"Identifier number"`` to ``umis_alignments_buffer[umi]``.

    The buffer is handed to ``flush_tuples`` periodically and once more
    after the last record. Progress statistics are appended to
    ``_config.SRC_DIR + 'b7_status_<nm>_<split>.out'``. The per-read
    diagnostic prints of the original are kept.

    Args:
        nm: Sample name; used in input, output and status file names.
        split: Split identifier (a string; it is concatenated into paths).

    Returns:
        None.
    """
    # Expected amplicon layouts, kept for reference only (not used below):
    # READ1: NNNtaccagctgccctcgTCGaCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGNNNNNNNNNNNNNNNNNNNNNNNNtgattacacatatagacacgcGAGCAGCCATCTTTTATAGAATGGGtagaacccgtcctaaggactcagattgagcatcgtttgcttctcgagtactacctgg
    # READ2: NNNaaccgctgtgttctgcACGCGTNNNNNNNNNNNNNNNNNNACCGGTgcaggtaatgggccttactatcagtctcagtccttgtacagctcgtccatgccgagagtgatcccggcggcggtcacgaactccagcaggaccatgtgatcgcgcttctcgttggggtctttgctca

    def quality(line):
        """Return the mean of ``ord(char) - 33`` over a quality line."""
        return np.mean([ord(s) - 33 for s in line.strip()])

    # Loop-invariant layout constants, hoisted out of the per-read loop.
    r1_grna19_format = "N" * 19
    r1_grna20_format = "N" * 20
    r2_umi_format = "N" * 15
    r1_prefix_constant = "GACGAAACACCG".upper()
    r1_grna_start = len(r1_prefix_constant)
    r2_prefix_constant = "tcaaacaggacggcagcgtgcagctcgcc".upper()
    r2_umi_start = len(r2_prefix_constant)

    # Map each designed gRNA to the identifier of its first row once,
    # instead of scanning the whole design table for every read.
    # setdefault keeps the first occurrence, matching `.iloc[0]` on the
    # filtered frame.
    grna_col = "Designed gRNA (NGG orientation, 19 and 20)"
    grna_to_id = {}
    for grna, ident in zip(exp_design[grna_col],
                           exp_design["Identifier number"]):
        grna_to_id.setdefault(grna, ident)

    read_constant_rejection_count = 0
    qc_rejection_count = 0
    accepted_count = 0
    grna_failure_count = 0
    read1_rejection_count = 0

    print('%s %s' % (nm, split))
    umis_alignments_buffer = init_umis_alignments_buffer()
    stdout_fn = _config.SRC_DIR + 'b7_status_%s_%s.out' % (nm, split)
    util.exists_empty_fn(stdout_fn)
    out_dir = out_place + nm + '/' + split + '/'
    util.ensure_dir_exists(out_dir)
    inp_fn1 = inp_dir + '%s_1_sequence_%s.fastq' % (nm, split)
    inp_fn2 = inp_dir + '%s_2_sequence_%s.fastq' % (nm, split)
    prepare_outfns(out_dir)

    tot_reads = util.line_count(inp_fn1)
    timer = util.Timer(total=tot_reads)
    # 0 for inputs shorter than 10 lines; periodic flushing is then skipped
    # (instead of failing on modulo-by-zero) and the final flush writes
    # everything.
    flush_every = int(tot_reads / 10)
    lines_per_pct = max(1, int(tot_reads / 100))

    i = -1
    with open(inp_fn1) as f1:
        with open(inp_fn2) as f2:
            while True:
                i += 1
                try:
                    r2_l = next(f2)
                    r1_l = next(f1)
                except StopIteration:
                    break

                # Housekeeping runs before the record is processed so the
                # `continue` statements below cannot skip it.
                timer.update()
                if flush_every > 0 and i > 1 and i % flush_every == 0:
                    print("FLUSHING!")
                    print(accepted_count)
                    # Flush alignment buffer
                    flush_tuples(umis_alignments_buffer, out_dir)
                    print(len(umis_alignments_buffer.keys()))
                    # Stats for the curious
                    with open(stdout_fn, 'a') as outf:
                        outf.write('Time: %s\n' % (datetime.datetime.now()))
                        outf.write('Progress: %s\n' % (i / lines_per_pct))
                        outf.write('Quality filtered pct: %s\n' % (
                            float(qc_rejection_count) / max(1, i // 4)))
                        outf.write(
                            "accepted {0}, rejected {1} bad read1\n{2} rc rejection\n"
                            .format(accepted_count, read1_rejection_count,
                                    read_constant_rejection_count))

                if i % 4 == 1:
                    # Raw sequence lines (not stripped), as before.
                    read1 = r1_l
                    read2 = r2_l

                if i % 4 == 3:
                    if quality(r2_l) < 28 or quality(r1_l) < 28:
                        qc_rejection_count += 1
                        continue

                    print(read1)
                    print(read2)
                    print(len(read2))
                    print("working")

                    print(r1_prefix_constant)
                    # str.find returns -1 when the constant is absent; no
                    # broad exception handler is needed for that case.
                    a1_offset = read1.upper().find(r1_prefix_constant)
                    if a1_offset < 0:
                        read1_rejection_count += 1
                        print("A1 EXCEPTION")
                        continue

                    a2_offset = read2.upper().find(r2_prefix_constant)
                    if a2_offset < 0:
                        read_constant_rejection_count += 1
                        print("A2 REJECTION")
                        continue

                    grna_region = read1[a1_offset + r1_grna_start:]
                    read1_grna19 = grna_region[:len(r1_grna19_format)]
                    read1_grna20 = grna_region[:len(r1_grna20_format)]
                    read2_umi_content = read2[a2_offset + r2_umi_start:][
                        :len(r2_umi_format)]

                    print(a2_offset)
                    print(r2_umi_start)
                    print(len(r2_umi_format))
                    print(len(read2_umi_content))

                    # Prefer the 20-mer design, fall back to the 19-mer.
                    if read1_grna20 in grna_to_id:
                        identifier = grna_to_id[read1_grna20]
                    elif read1_grna19 in grna_to_id:
                        identifier = grna_to_id[read1_grna19]
                    else:
                        grna_failure_count += 1
                        continue

                    output_complete = """>1\n{0}\n{1}""".format(
                        read2_umi_content, identifier)
                    output_short = (read2_umi_content, identifier)
                    print(output_short)
                    umis_alignments_buffer[read2_umi_content].append(
                        output_complete)
                    accepted_count += 1

    # Final flush so records buffered after the last periodic flush are
    # written too.
    # NOTE(review): assumes flush_tuples may safely be called again on a
    # buffer it has already flushed, as the periodic flushes rely on -
    # confirm against its implementation.
    flush_tuples(umis_alignments_buffer, out_dir)
    return
def matchmaker(nm, split):
    """Align target reads to designed targets after a gRNA consistency check.

    Reads ``inp_dir + '<nm>_R1_<split>.fq'`` and ``..._R2_...`` in lockstep
    as four-line FASTQ records. The reverse complement of the first 61
    bases of read 1 is the target read, ``find_ulmi`` extracts a ULMI and
    its index from read 1, and read 2 carries the gRNA. A record is
    processed when the mean ``ord(char) - 33`` score over the selected
    quality windows is at least 28: candidate designs are looked up with
    ``find_best_designed_target``, checked against the gRNA read with
    ``compare_target_to_grna``, and, if that returns ``'ok'``, aligned and
    stored in the alignment buffer.

    The buffer is flushed with ``flush_alignments`` periodically and once
    after the last line. Statistics are appended to
    ``_config.SRC_DIR + 'nh_c_<nm>_<split>.out'``.

    Args:
        nm: Sample name; used in input, output and status file names.
        split: Split identifier (a string; it is concatenated into paths).

    Returns:
        None.
    """
    print(nm, split)
    stdout_fn = _config.SRC_DIR + 'nh_c_%s_%s.out' % (nm, split)
    util.exists_empty_fn(stdout_fn)
    out_dir = out_place + nm + '/' + split + '/'
    util.ensure_dir_exists(out_dir)

    read1_fn = inp_dir + '%s_R1_%s.fq' % (nm, split)
    read2_fn = inp_dir + '%s_R2_%s.fq' % (nm, split)

    lsh_dict = build_targets_better_lsh()
    alignment_buffer = init_alignment_buffer()
    prepare_outfns(out_dir)

    num_bad_matches = 0
    quality_pass = 0

    tot_lines = util.line_count(read1_fn)
    timer = util.Timer(total=tot_lines)
    # 0 for inputs shorter than 200 lines; periodic flushing is then
    # skipped (instead of failing on modulo-by-zero) and the final flush
    # writes everything.
    flush_every = int(tot_lines / 200)

    with open(read1_fn) as f1, open(read2_fn) as f2:
        for i, (line1, line2) in enumerate(zip(f1, f2)):
            if i % 4 == 1:
                # RC of l1 contains target
                line1 = line1.strip()
                target_read = compbio.reverse_complement(line1[:61])
                ulmi, ulmi_idx = find_ulmi(line1)
                # l2 contains gRNA
                grna_read = line2.strip()

            if i % 4 == 3:
                q1, q2 = line1.strip(), line2.strip()
                # Quality strings are reversed to follow the
                # reverse-complemented sequences.
                read_q = q1[:61][::-1]
                ulmi_q = q1[ulmi_idx:ulmi_idx + len(ulmi)][::-1]
                grna_q = q2[18:22 + 20]
                qs = [ord(s) - 33 for s in read_q + ulmi_q + grna_q]
                if np.mean(qs) >= 28:
                    quality_pass += 1
                    align_header = '>1_%s_%s' % (ulmi, ulmi_q)

                    # Try to find designed target from LSH
                    cand_idxs = find_best_designed_target(
                        target_read, lsh_dict)
                    if len(cand_idxs) > 0:
                        bad_match = compare_target_to_grna(
                            cand_idxs, grna_read)
                        if bad_match == 'ok':
                            # Run alignment and store in buffer
                            best_idx, align = alignment(
                                target_read, cand_idxs)
                            # A missing alignment is skipped without
                            # `continue`, so the flush check and timer
                            # below still run for this line.
                            if align is not None:
                                store_alignment(alignment_buffer, best_idx,
                                                align_header, align, read_q)
                        else:
                            num_bad_matches += 1
                    else:
                        num_bad_matches += 1

            if flush_every > 0 and i % flush_every == 1 and i > 1:
                # Flush alignment buffer
                flush_alignments(alignment_buffer, out_dir)
                alignment_buffer = init_alignment_buffer()

                # Stats for the curious
                frac_bad = (float(num_bad_matches) / quality_pass
                            if quality_pass else 0.0)
                with open(stdout_fn, 'a') as outf:
                    outf.write('Time: %s\n' % (datetime.datetime.now()))
                    outf.write('Progress: %s\n' % (i / int(tot_lines / 100)))
                    outf.write('Num. mismatched gRNA/target pairs: %s\n' %
                               (num_bad_matches))
                    outf.write('Frac. mismatched gRNA/target pairs: %s\n' %
                               (frac_bad))

            timer.update()

    # Final flush
    flush_alignments(alignment_buffer, out_dir)
    return
def prepare_outfns(out_dir):
    """Create/empty one ``<name>.txt`` file in ``out_dir`` per designed target.

    Iterates over ``names_targets`` and calls ``util.exists_empty_fn`` on
    ``out_dir + '<name>.txt'`` for each entry.

    Args:
        out_dir: Output directory prefix; concatenated directly with the
            file name.

    Returns:
        None.
    """
    for target_name in names_targets:
        basename = '%s.txt' % (target_name)
        util.exists_empty_fn(out_dir + basename)
    return None
def matchmaker(nm, split):
    """Assign read pairs to peptide pairs and align read 1 to the target.

    Looks up the condition ``nm`` in ``exp_design`` to find its parent
    FASTQ name, library and target, loads the library design CSV, and
    (re)binds the module-level lookup tables ``lib_design``, ``prefixes``,
    ``peptide_nms``, ``prefix_to_peptide``, ``suffixes`` and
    ``suffix_to_peptide``. It then reads
    ``inp_dir + '<parent>_R1_<split, zero-padded to 3>.fq'`` and the
    matching ``_R2_`` file in lockstep as four-line FASTQ records.

    A record is rejected when the mean ``ord(char) - 33`` score over both
    quality lines is below 25, or when ``find_peptide1_nm(read2)`` or
    ``find_peptide2_nm(read1)`` returns no name. Otherwise read 1 (minus
    its first 6 bases, reverse-complemented when the target's gRNA
    orientation is ``'-'``) is aligned to the target sequence and stored in
    the alignment buffer under ``'<peptide1>-<peptide2>'``.

    The buffer is flushed with ``flush_alignments`` periodically and once
    after the last line. Running counts are appended to
    ``_config.SRC_DIR + 'nh_c_<nm>_<split>.out'`` and the final counts are
    written to ``out_dir + 'stats_<nm>_<split>.csv'``.

    Args:
        nm: Condition name; must match a row of ``exp_design['Name']``.
        split: Split identifier.

    Returns:
        None.
    """
    global lib_design
    global prefixes
    global peptide_nms
    global prefix_to_peptide
    global suffixes
    global suffix_to_peptide

    print(split)
    stdout_fn = _config.SRC_DIR + f'nh_c_{nm}_{split}.out'
    util.exists_empty_fn(stdout_fn)
    out_dir = f'{out_place}{nm}/{split}/'
    util.ensure_dir_exists(out_dir)

    # Parse condition-specific settings
    exp_row = exp_design[exp_design['Name'] == nm].iloc[0]
    parent_fn = exp_row['Parent file']
    lib_nm = exp_row['Library']
    target_nm = exp_row['Target']

    # Library design
    lib_design = pd.read_csv(_config.DATA_DIR + f'lib_{lib_nm}_design.csv')
    prefixes = [s[:prefix_len] for s in lib_design['Sequence']]
    peptide_nms = list(lib_design['Name'])
    # dict(zip(...)) keeps the last name for a repeated key, exactly like
    # the dict comprehension it replaces, without reusing `nm` as a loop
    # variable.
    prefix_to_peptide = dict(zip(prefixes, peptide_nms))
    suffixes = [
        compbio.reverse_complement(s[-suffix_len:])
        for s in lib_design['Sequence']
    ]
    suffix_to_peptide = dict(zip(suffixes, peptide_nms))

    # Target
    target_row = target_design[target_design['Target'] == target_nm].iloc[0]
    target = target_row['Sequence']
    target_strand = target_row['gRNA orientation']

    zf_split = str(split).zfill(3)
    read1_fn = inp_dir + f'{parent_fn}_R1_{zf_split}.fq'
    read2_fn = inp_dir + f'{parent_fn}_R2_{zf_split}.fq'

    count_stats = defaultdict(int)
    count_stats['Success'] = 0

    alignment_buffer = init_alignment_buffer()
    prepare_outfns(out_dir, peptide_nms)

    tot_lines = util.line_count(read1_fn)
    timer = util.Timer(total=tot_lines)

    # Number of periodic flushes aimed for over the whole file.
    flush_interval = 200
    # 0 for inputs shorter than `flush_interval` lines; periodic flushing
    # is then skipped (instead of failing on modulo-by-zero) and the final
    # flush writes everything.
    lines_per_flush = int(tot_lines / flush_interval)

    with open(read1_fn) as f1, open(read2_fn) as f2:
        for i, (line1, line2) in enumerate(zip(f1, f2)):
            # Housekeeping runs before the record is processed so the
            # `continue` statements below cannot skip it.
            timer.update()
            if lines_per_flush > 0 and i % lines_per_flush == 1 and i > 1:
                # Flush alignment buffer
                flush_alignments(alignment_buffer, out_dir)
                alignment_buffer = init_alignment_buffer()

                # Stats for the curious
                with open(stdout_fn, 'a') as outf:
                    outf.write(f'Time: {datetime.datetime.now()}\n')
                    outf.write(f'Progress: {i / int(tot_lines / 100)}\n')
                    outf.write(f'Line: {i}\n')
                    for key in sorted(count_stats):
                        outf.write(f'{key}, {count_stats[key]}\n')

            if i % 4 == 1:
                read1 = line1.strip()
                read2 = line2.strip()

            if i % 4 == 3:
                q1, q2 = line1.strip(), line2.strip()
                count_stats['Read count'] += 1

                qs = [ord(s) - 33 for s in q1 + q2]
                if np.mean(qs) < 25:
                    count_stats['1a. Quality fail'] += 1
                    continue

                # Peptide 1 is identified from read 2, peptide 2 from
                # read 1.
                res, msg = find_peptide1_nm(read2)
                if res is None:
                    count_stats[f'2{msg}'] += 1
                    continue
                p1_nm = res

                res, msg = find_peptide2_nm(read1)
                if res is None:
                    count_stats[f'2{msg}'] += 1
                    continue
                p2_nm = res

                peptide_nm = f'{p1_nm}-{p2_nm}'

                # Drop the first 6 bases of read 1 (and their qualities)
                # before aligning.
                read1 = read1[6:]
                q1 = q1[6:]
                if target_strand == '-':
                    read1 = compbio.reverse_complement(read1)
                    q1 = q1[::-1]

                # Run alignment and store in buffer
                align_header = '>1'
                align = alignment(read1, target)
                store_alignment(alignment_buffer, peptide_nm, align_header,
                                align, q1)
                count_stats['Success'] += 1

    # Final flush
    flush_alignments(alignment_buffer, out_dir)

    stats_df = pd.DataFrame(count_stats, index=[0])
    sorted_cols = sorted(stats_df.columns)
    stats_df = stats_df[sorted_cols]
    stats_df.to_csv(out_dir + f'stats_{nm}_{split}.csv')
    return
def prepare_outfns(out_dir, peptide_nms):
    """Create/empty one ``<p1>-<p2>.txt`` file per ordered pair of peptides.

    Calls ``util.exists_empty_fn`` on ``out_dir + '<p1>-<p2>.txt'`` for
    every ordered pair drawn from ``peptide_nms``, self-pairs included.

    Args:
        out_dir: Output directory prefix; concatenated directly with the
            file name.
        peptide_nms: Iterable of peptide names.

    Returns:
        None.
    """
    # Materialise once: avoids rebuilding the list for every outer element
    # and keeps the pairing correct if a one-shot iterable is passed.
    names = list(peptide_nms)
    for p1 in names:
        for p2 in names:
            util.exists_empty_fn(out_dir + f'{p1}-{p2}.txt')
    return
def matchmaker(nm, split):
    """Align quality-passing reads of one split to their best designed target.

    Reads ``inp_dir + '<nm>_r2_<split>.fq'`` as four-line FASTQ records.
    A record is rejected when the mean ``ord(char) - 33`` score of its
    quality line is below 28. Otherwise the read is reverse-complemented,
    candidate designs are looked up with ``find_best_designed_target``,
    and, if there is at least one candidate, the read is aligned and the
    result is stored in the alignment buffer.

    The buffer is flushed with ``flush_alignments`` periodically and once
    after the last line. Statistics are appended to
    ``_config.SRC_DIR + 'nh_c_<nm>_<split>.out'``.

    Args:
        nm: Sample name; used in input, output and status file names.
        split: Split identifier (a string; it is concatenated into paths).

    Returns:
        None.
    """
    print('%s %s' % (nm, split))
    stdout_fn = _config.SRC_DIR + 'nh_c_%s_%s.out' % (nm, split)
    util.exists_empty_fn(stdout_fn)
    out_dir = out_place + nm + '/' + split + '/'
    util.ensure_dir_exists(out_dir)

    inp_fn = inp_dir + '%s_r2_%s.fq' % (nm, split)

    lsh_dict = build_targets_better_lsh()
    alignment_buffer = init_alignment_buffer()
    prepare_outfns(out_dir)

    qf = 0

    tot_reads = util.line_count(inp_fn)
    timer = util.Timer(total=tot_reads)
    # 0 for inputs shorter than 100 lines; periodic flushing is then
    # skipped (instead of failing on modulo-by-zero) and the final flush
    # writes everything.
    flush_every = int(tot_reads / 100)

    with open(inp_fn) as f:
        for i, line in enumerate(f):
            if i % 4 == 1:
                l2 = line.strip()

            if i % 4 == 3:
                # Quality filter
                qs = [ord(s) - 33 for s in line.strip()]
                if np.mean(qs) < 28:
                    qf += 1
                else:
                    l2 = compbio.reverse_complement(l2)
                    align_header = '>1'

                    # Try to find designed target from LSH
                    cand_idxs = find_best_designed_target(l2, lsh_dict)
                    if len(cand_idxs) > 0:
                        # Run alignment
                        best_idx, align = alignment(l2, cand_idxs)
                        # Store alignment into buffer
                        store_alignment(alignment_buffer, best_idx,
                                        align_header, align)

            # Rejected reads fall through to here (no `continue`), so the
            # flush check and timer run for every line.
            if flush_every > 0 and i % flush_every == 1 and i > 1:
                # Flush alignment buffer
                flush_alignments(alignment_buffer, out_dir)
                alignment_buffer = init_alignment_buffer()

                # Stats for the curious
                with open(stdout_fn, 'a') as outf:
                    outf.write('Time: %s\n' % (datetime.datetime.now()))
                    outf.write('Progress: %s\n' % (i / flush_every))
                    # Explicit float division and a non-zero denominator:
                    # integer division would always report 0, or fail for
                    # i < 4.
                    outf.write('Quality filtered pct: %s\n' % (
                        float(qf) / max(1, i // 4)))

            timer.update()

    # Final flush
    flush_alignments(alignment_buffer, out_dir)
    return