def filterStreubel(binding_site):
    """Tell whether ``binding_site`` should be rejected by the Streubel filter.

    The first half-site (``seq1_seq``) is checked as stored; the second
    (``seq2_seq``) is checked as its reverse complement. A half-site fails
    when fewer than 25% of its bases are C or G, or when
    ``streubel_at_streak_re`` finds at least one match in it.

    Returns:
        True if either half-site fails a check, False if both pass.
    """
    # The reverse complement is computed up front, before any check runs.
    second_half = reverseComplement(binding_site.seq2_seq)

    for half_site in (binding_site.seq1_seq, second_half):
        gc_total = sum(half_site.count(base) for base in "CG")
        if float(gc_total) / len(half_site) < 0.25:
            return True
        if streubel_at_streak_re.findall(half_site):
            return True

    return False
def _encode_rvds(bases, rvd_by_base):
    """Translate ``bases`` into a space-separated RVD string and count its C/G bases.

    Returns a ``(rvd_string, cg_count)`` tuple, or None as soon as a base
    without an entry in ``rvd_by_base`` is encountered.
    """
    rvds = []
    cg_count = 0
    for base in bases:
        if base not in rvd_by_base:
            return None
        if base in ("C", "G"):
            cg_count += 1
        rvds.append(rvd_by_base[base])
    return " ".join(rvds), cg_count


def _half_site_seen(site_entry_counts, tal1_seq, tal2_seq):
    """Return True if either half-site is already indexed alongside itself or the other one.

    ``site_entry_counts`` maps a first half-site sequence to a dict keyed by
    second half-site sequence. The pair counts as seen when ``tal1_seq`` or
    ``tal2_seq`` is a first-level key whose inner dict contains ``tal1_seq``
    or ``tal2_seq``.
    """
    for first in (tal1_seq, tal2_seq):
        partners = site_entry_counts.get(first)
        if partners is not None and (tal2_seq in partners or tal1_seq in partners):
            return True
    return False


def RunFindTALTask(options):
    """Scan the FASTA input for paired TAL binding sites and write them as a tab-separated table.

    Attributes read from ``options``:
        logFilepath: passed to ``create_logger`` and to the off-target counter.
        fasta: path of the FASTA file to scan.
        outpath / outdir / job / outfile: the output file is ``outpath`` unless
            it equals "NA", in which case it is ``outdir + job + outfile``.
        arraymin / arraymax: half-site length bounds (15 / 20 when None).
        min / max: spacer length bounds (15 / 30 when None).
        cupstream: "T" is an accepted upstream base unless this is 1, and "C"
            is accepted unless this is 0.
        gspec: when truthy, G is encoded with RVD "NH" instead of "NN".
        filter / filterbase: with ``filter == 1`` only the cut site
            ``filterbase`` is examined and every site found is kept; with
            ``filter == 0`` candidates are reduced with ``filterByTALSize`` to
            one site per spacer size and then one per cut position.
        streubel: when truthy, sites for which ``filterStreubel`` returns True
            are dropped.
        check_offtargets, offtargets_fasta, offtargets_ncbi, genome,
        promoterome, organism: control off-target counting and select the
            sequence file the counts are computed against.

    Raises:
        TaskError: if off-target counting is requested but ``tfcount_found`` is
            falsy, or if a retrieved NCBI record is longer than
            ``OFFTARGET_COUNTING_SIZE_LIMIT``.
    """
    logger = create_logger(options.logFilepath)
    logger("Beginning")

    uses_ncbi_offtargets = options.check_offtargets and options.offtargets_ncbi != "NA"
    if uses_ncbi_offtargets:
        logger(
            "Retrieving NCBI off-target sequence. This could take a while if this sequence hasn't been used recently and needs to be downloaded from NCBI."
        )

    with Conditional(
        uses_ncbi_offtargets, CachedEntrezFile(logger, options.offtargets_ncbi)
    ) as maybe_entrez_file:
        offtarget_seq_filename = ""

        if options.check_offtargets:
            if not tfcount_found:
                raise TaskError("Non off-target counting worker attempted to process off-target counting task.")

            if options.offtargets_ncbi != "NA":
                logger("Finished retrieving NCBI off-target sequence.")

                # Validate downloaded sequence
                check_fasta_pasta(maybe_entrez_file.file)
                for record in FastaIterator(maybe_entrez_file.file, alphabet=generic_dna):
                    if len(record.seq) > OFFTARGET_COUNTING_SIZE_LIMIT:
                        raise TaskError(
                            "Off-Target counting is only supported for NCBI records where all individual sequences are under %d megabases in size"
                            % (OFFTARGET_COUNTING_SIZE_LIMIT / 1000000)
                        )

            # The off-target sequence file is only consumed by the counting
            # step below, so it is only resolved when counting is requested.
            if options.offtargets_fasta != "NA":
                offtarget_seq_filename = options.offtargets_fasta
            elif options.offtargets_ncbi != "NA":
                offtarget_seq_filename = maybe_entrez_file.filepath
            elif options.genome:
                offtarget_seq_filename = GENOME_FILE % options.organism
            elif options.promoterome:
                offtarget_seq_filename = PROMOTEROME_FILE % options.organism
            else:
                offtarget_seq_filename = options.fasta

        strong_binding_RVDs = {"A": "NI", "C": "HD", "G": "NN", "T": "NG"}
        if options.gspec:
            strong_binding_RVDs["G"] = "NH"

        if options.outpath == "NA":
            output_filepath = options.outdir + options.job + options.outfile
        else:
            output_filepath = options.outpath

        strand_min = 15 if options.arraymin is None else options.arraymin
        strand_max = 20 if options.arraymax is None else options.arraymax
        spacer_min = 15 if options.min is None else options.min
        spacer_max = 30 if options.max is None else options.max

        u_bases = []
        if options.cupstream != 1:
            u_bases.append("T")
        if options.cupstream != 0:
            u_bases.append("C")

        # Both files are managed by the with-statement so they are closed even
        # when scanning or off-target counting raises.
        with open(options.fasta, "r") as seq_file, open(output_filepath, "w") as out:
            table_ignores = ["TAL1 length", "TAL2 length", "Spacer length"]
            out.write("table_ignores:" + ",".join(table_ignores) + "\n")

            out.write(
                "options_used:"
                + ", ".join(
                    [
                        "array_min = " + str(strand_min),
                        "array_max = " + str(strand_max),
                        "spacer_min = " + str(spacer_min),
                        "spacer_max = " + str(spacer_max),
                        "upstream_base = " + (" or ".join(u_bases)),
                    ]
                )
                + "\n"
            )

            offtarget_header = "\tOff-Target Counts" if options.check_offtargets else ""
            out.write(
                "Sequence Name\tCut Site\tTAL1 start\tTAL2 start\tTAL1 length\tTAL2 length\tSpacer length\tSpacer range\tTAL1 RVDs\tTAL2 RVDs\tPlus strand sequence\tUnique RE sites in spacer\t% RVDs HD or NN/NH"
                + offtarget_header
                + "\n"
            )

            binding_sites = []

            for gene in FastaIterator(seq_file, alphabet=generic_dna):
                sequence = str(gene.seq).upper()
                # Per-record index {TAL1 seq: {TAL2 seq: [BindingSite, ...]}},
                # consulted to skip half-site sequences that were already used.
                site_entry_counts = {}

                if options.filter == 1:
                    if options.filterbase > len(sequence):
                        logger("Skipped %s as the provided cut site was greater than the sequence length" % (gene.id))
                        continue
                    cut_site_positions = [options.filterbase]
                else:
                    cut_site_positions = range(len(sequence))

                logger("Scanning %s for binding sites" % (gene.id))

                for i in cut_site_positions:
                    cut_site_potential_sites = []

                    for spacer_size in range(spacer_min, spacer_max + 1):
                        spacer_potential_sites = []

                        # Split the spacer around the cut site; an odd spacer
                        # puts the extra base on the right (floor / ceil of half).
                        spacer_size_left = spacer_size // 2
                        spacer_size_right = spacer_size - spacer_size_left

                        # Skip cut sites too close to either end of the sequence
                        # to fit a minimum-length half-site plus its flanking base.
                        if i < (strand_min + spacer_size_left + 1) or i > (
                            len(sequence) - (strand_min + spacer_size_right) - 1
                        ):
                            continue

                        for u_base in u_bases:
                            # The base required after TAL2 on the plus strand is
                            # the complement of the upstream base.
                            d_base = "A" if u_base == "T" else "G"

                            u_pos_search_start = i - (strand_max + spacer_size_left) - 1
                            if u_pos_search_start < 0:
                                u_pos_search_start = 0
                            u_pos_search_end = i - (strand_min + spacer_size_left)

                            d_pos_search_start = i + (strand_min + spacer_size_right)
                            d_pos_search_end = i + (strand_max + spacer_size_right) + 1

                            # Every upstream-base position in the window,
                            # collected right-to-left.
                            u_positions = []
                            while True:
                                u_pos = sequence.rfind(u_base, u_pos_search_start, u_pos_search_end)
                                if u_pos == -1:
                                    break
                                u_pos_search_end = u_pos
                                u_positions.append(u_pos)

                            # Every downstream-base position in the window,
                            # collected left-to-right.
                            d_positions = []
                            while True:
                                d_pos = sequence.find(d_base, d_pos_search_start, d_pos_search_end)
                                if d_pos == -1:
                                    break
                                d_pos_search_start = d_pos + 1
                                d_positions.append(d_pos)

                            break_out = False

                            # reversed() visits the leftmost upstream base and the
                            # rightmost downstream base first, i.e. the longest
                            # TAL1 / TAL2 candidates are tried first.
                            for u_pos in reversed(u_positions):
                                for d_pos in reversed(d_positions):
                                    # uses inclusive start, exclusive end
                                    tal1_start = u_pos + 1
                                    tal1_end = i - spacer_size_left
                                    tal1_seq = sequence[tal1_start:tal1_end]

                                    tal2_start = i + spacer_size_right
                                    tal2_end = d_pos
                                    tal2_seq = sequence[tal2_start:tal2_end]

                                    if _half_site_seen(site_entry_counts, tal1_seq, tal2_seq):
                                        continue

                                    # A half-site containing a base with no RVD
                                    # (anything outside A/C/G/T) is unusable.
                                    tal1_encoded = _encode_rvds(tal1_seq, strong_binding_RVDs)
                                    if tal1_encoded is None:
                                        continue
                                    tal2_encoded = _encode_rvds(
                                        reverseComplement(tal2_seq), strong_binding_RVDs
                                    )
                                    if tal2_encoded is None:
                                        continue

                                    tal1_rvd, tal1_cg = tal1_encoded
                                    tal2_rvd, tal2_cg = tal2_encoded
                                    cg_count = tal1_cg + tal2_cg

                                    # Without a fixed cut site, only the first valid
                                    # pair per upstream base is kept.
                                    if options.filter == 0:
                                        break_out = True

                                    binding_site = BindingSite(
                                        seq_id=gene.id,
                                        cutsite=i,
                                        seq1_start=tal1_start,
                                        seq1_end=tal1_end,
                                        seq1_seq=tal1_seq,
                                        seq1_rvd=tal1_rvd,
                                        spacer_start=tal1_end,
                                        spacer_end=tal2_start,
                                        spacer_seq=sequence[tal1_end:tal2_start],
                                        seq2_start=tal2_start,
                                        seq2_end=tal2_end,
                                        seq2_seq=tal2_seq,
                                        seq2_rvd=tal2_rvd,
                                        upstream=u_base,
                                        # Round the percentage itself: int(round(x, 2) * 100)
                                        # truncates values such as 0.29 * 100 ==
                                        # 28.999999999999996 down to 28.
                                        cg_percent=int(
                                            round(100.0 * cg_count / (len(tal1_seq) + len(tal2_seq)))
                                        ),
                                    )

                                    findRESitesInSpacer(sequence, binding_site)

                                    site_entry_counts.setdefault(binding_site.seq1_seq, {}).setdefault(
                                        binding_site.seq2_seq, []
                                    ).append(binding_site)

                                    spacer_potential_sites.append(binding_site)

                                    if break_out:
                                        break

                                if break_out:
                                    break

                        if spacer_potential_sites:
                            if options.filter == 0:
                                cut_site_potential_sites.append(
                                    reduce(filterByTALSize, spacer_potential_sites)
                                )
                            else:
                                cut_site_potential_sites.extend(spacer_potential_sites)

                    if cut_site_potential_sites:
                        if options.filter == 0:
                            binding_sites.append(reduce(filterByTALSize, cut_site_potential_sites))
                        else:
                            binding_sites.extend(cut_site_potential_sites)

            if options.streubel:
                binding_sites[:] = [site for site in binding_sites if not filterStreubel(site)]

            if options.check_offtargets and binding_sites:
                off_target_pairs = [[site.seq1_rvd, site.seq2_rvd] for site in binding_sites]

                # The literal 3.0 is forwarded unchanged; its meaning is defined
                # by PairedTargetFinderCountTask.
                off_target_counts = PairedTargetFinderCountTask(
                    offtarget_seq_filename,
                    options.logFilepath,
                    options.cupstream,
                    3.0,
                    spacer_min,
                    spacer_max,
                    off_target_pairs,
                )

                for index, binding_site in enumerate(binding_sites):
                    binding_site.offtarget_counts = off_target_counts[index]

            for binding_site in binding_sites:
                plus_strand = " ".join(
                    [
                        binding_site.upstream,
                        binding_site.seq1_seq,
                        binding_site.spacer_seq.lower(),
                        binding_site.seq2_seq,
                        "A" if binding_site.upstream == "T" else "G",
                    ]
                )

                output_items = [
                    str(binding_site.seq_id),
                    str(binding_site.cutsite),
                    str(binding_site.seq1_start),
                    # "TAL2 start" is reported as the last plus-strand index of TAL2.
                    str(binding_site.seq2_end - 1),
                    str(binding_site.seq1_end - binding_site.seq1_start),
                    str(binding_site.seq2_end - binding_site.seq2_start),
                    str(binding_site.spacer_end - binding_site.spacer_start),
                    str(binding_site.spacer_start) + "-" + str(binding_site.spacer_end - 1),
                    binding_site.seq1_rvd,
                    binding_site.seq2_rvd,
                    plus_strand,
                    # presumably populated by findRESitesInSpacer -- TODO confirm
                    binding_site.re_sites,
                    str(binding_site.cg_percent),
                ]

                if options.check_offtargets:
                    output_items.append(
                        " ".join(str(binding_site.offtarget_counts[x]) for x in range(5))
                    )

                out.write("\t".join(output_items) + "\n")

        logger("Finished")