def validateOptions(options):
    """Check the submitted options, raising TaskError on the first problem.

    Checks, in order: the ``cupstream`` choice, the ``cutoff`` choice, the
    format of ``rvdString`` (against ``RVD_SEQ_REGEX``) and finally the
    sequence source. When ``options.ncbi`` is set it is stripped in place and
    must not be combined with ``genome``/``promoterome``; otherwise the
    organism is checked for genome/promoterome searches, or the FASTA file is
    opened and passed to ``check_fasta_pasta``.

    Raises:
        TaskError: if any option fails its check.
    """
    if options.cupstream not in (0, 1, 2):
        raise TaskError("Invalid cupstream value provided")

    if options.cutoff not in (3, 3.5, 4):
        raise TaskError("Invalid cutoff value provided")

    rvd_flags = re.IGNORECASE | re.MULTILINE
    if re.compile(RVD_SEQ_REGEX, rvd_flags).match(options.rvdString) is None:
        raise TaskError("RVD sequence is not in the correct format. Enter between 12 and 31 RVDs using the standard single letter amino acid abbreviations.")

    if options.ncbi != "NA":
        options.ncbi = options.ncbi.strip()
        if options.genome or options.promoterome:
            raise TaskError("--genome and --promoterome options cannot be combined with --ncbi")
        # NCBI sequence validation is performed after the task has started
        # instead of here to avoid having to download large files more than once
        return

    if options.genome and options.organism not in VALID_GENOME_ORGANISMS:
        raise TaskError("Invalid organism specified.")
    if options.promoterome and options.organism not in VALID_PROMOTEROME_ORGANISMS:
        raise TaskError("Invalid organism specified.")

    if not (options.genome or options.promoterome):
        with open(options.fasta, 'r') as fasta_handle:
            check_fasta_pasta(fasta_handle)
def RunTalesfTask(options):
    """Run ``ScoreTalesfTask`` against the sequence source chosen in *options*.

    The sequence file is, in priority order: the cached NCBI download
    (``options.ncbi``), the organism's genome file, the organism's
    promoterome file, or the user-supplied FASTA file. A downloaded NCBI
    sequence is passed to ``check_fasta_pasta`` before use.

    Raises:
        TaskError: if ``ScoreTalesfTask`` returns 1.
    """
    logger = create_logger(options.logFilepath)
    logger("Beginning")

    forward_only = not options.revcomp
    use_ncbi = options.ncbi != "NA"

    if use_ncbi:
        logger("Retrieving NCBI sequence. This could take a while if this sequence hasn't been used recently and needs to be downloaded from NCBI.")

    with Conditional(use_ncbi, CachedEntrezFile(logger, options.ncbi)) as maybe_entrez_file:
        if use_ncbi:
            # Validate downloaded sequence
            check_fasta_pasta(maybe_entrez_file.file)
            sequence_path = maybe_entrez_file.filepath
        elif options.genome:
            sequence_path = GENOME_FILE % options.organism
        elif options.promoterome:
            sequence_path = PROMOTEROME_FILE % options.organism
        else:
            sequence_path = options.fasta

        # The organism name is only passed along for genome searches.
        organism_arg = options.organism if options.genome else ""

        status = ScoreTalesfTask(
            sequence_path,
            options.rvdString,
            options.outputFilepath,
            options.logFilepath,
            forward_only,
            options.cupstream,
            options.cutoff,
            4,
            organism_arg,
        )

        if status == 1:
            raise TaskError()
def validateOptions(options):
    """Check the submitted options, raising TaskError on the first problem.

    Requires a FASTA file, a ``cupstream`` value of 0, 1 or 2, and repeat
    array (``arraymin``/``arraymax``) and spacer (``min``/``max``) lengths
    that each lie in 10..35 with the maximum not below the minimum. Finally
    the FASTA file is opened and passed to ``check_fasta_pasta``.

    Raises:
        TaskError: if any option fails its check.
    """
    if options.fasta == 'NA':
        raise TaskError("FASTA file required.")

    if options.cupstream not in (0, 1, 2):
        raise TaskError("Invalid cupstream value provided")

    def outside_limits(length):
        return length < 10 or length > 35

    # (minimum attribute, maximum attribute, messages) checked in this order.
    length_rules = (
        (
            "arraymin",
            "arraymax",
            "Minimum repeat array length must be between 10 and 35",
            "Maximum repeat array length must be between 10 and 35",
            "Maximum repeat array length must be greater than the minimum repeat array length",
        ),
        (
            "min",
            "max",
            "Minimum spacer length must be between 10 and 35",
            "Maximum spacer length must be between 10 and 35",
            "Maximum spacer length cannot be less than the minimum spacer length",
        ),
    )

    for low_name, high_name, low_message, high_message, order_message in length_rules:
        lowest = getattr(options, low_name)
        if outside_limits(lowest):
            raise TaskError(low_message)
        highest = getattr(options, high_name)
        if outside_limits(highest):
            raise TaskError(high_message)
        if highest < lowest:
            raise TaskError(order_message)

    with open(options.fasta, 'r') as fasta_handle:
        check_fasta_pasta(fasta_handle)
def validateOptions(options):
    """Check the submitted options, raising TaskError on the first problem.

    Requires a FASTA file, a ``cupstream`` value of 0, 1 or 2 and repeat array
    lengths in 10..35 with the maximum not below the minimum. Supplying
    ``offtargets_ncbi`` or ``offtargets_fasta`` switches
    ``options.check_offtargets`` on; the two are mutually exclusive with each
    other and with ``genome``/``promoterome``. With off-target counting on,
    the organism is checked and the input sequences may total at most 1000
    bases.

    Raises:
        TaskError: if any option fails its check.
    """
    if options.fasta == 'NA':
        raise TaskError('FASTA file required.')

    if options.cupstream not in (0, 1, 2):
        raise TaskError("Invalid cupstream value provided")

    shortest_array = options.arraymin
    if shortest_array < 10 or shortest_array > 35:
        raise TaskError("Minimum repeat array length must be between 10 and 35")

    longest_array = options.arraymax
    if longest_array < 10 or longest_array > 35:
        raise TaskError("Maximum repeat array length must be between 10 and 35")

    if longest_array < shortest_array:
        raise TaskError("Maximum repeat array length must be greater than the minimum repeat array length")

    ncbi_given = options.offtargets_ncbi != "NA"
    fasta_given = options.offtargets_fasta != "NA"

    if ncbi_given:
        if fasta_given or options.genome or options.promoterome:
            raise TaskError("--offtargets-fasta, --genome and --promoterome options cannot be combined with --offtargets-ncbi")
        # NCBI sequence validation is performed after the task has started
        # instead of here to avoid having to download large files more than once
        options.check_offtargets = True

    if fasta_given:
        if ncbi_given or options.genome or options.promoterome:
            raise TaskError("--offtargets-ncbi, --genome and --promoterome options cannot be combined with --offtargets-fasta")
        offtarget_path = options.offtargets_fasta
        if not (os.path.exists(offtarget_path) and os.path.getsize(offtarget_path) > 2):
            raise TaskError("Off-target FASTA file must exist and be non-empty.")
        options.check_offtargets = True

    if options.check_offtargets:
        if options.genome and options.organism not in VALID_GENOME_ORGANISMS:
            raise TaskError("Invalid organism specified.")
        if options.promoterome and options.organism not in VALID_PROMOTEROME_ORGANISMS:
            raise TaskError("Invalid organism specified.")

    with open(options.fasta, 'r') as fasta_handle:
        check_fasta_pasta(fasta_handle)

        if options.check_offtargets:
            total_bases = sum(
                len(record.seq)
                for record in FastaIterator(fasta_handle, alphabet=generic_dna)
            )
            if total_bases > 1000:
                raise TaskError("Off-target counting is only available when designing TALEs for sequences that are 1000 bases or less")
def validateOptions(options):
    """Check the submitted options, raising TaskError on the first problem.

    Checks, in order: the ``cupstream`` choice, the ``cutoff`` choice, the
    format of ``rvdString`` (against ``RVD_SEQ_REGEX``), the organism for
    genome/promoterome searches and, when neither of those is selected, the
    FASTA file (opened and passed to ``check_fasta_pasta``).

    Raises:
        TaskError: if any option fails its check.
    """
    if options.cupstream not in (0, 1, 2):
        raise TaskError("Invalid cupstream value provided")

    if options.cutoff not in (3, 3.5, 4):
        raise TaskError("Invalid cutoff value provided")

    rvd_pattern = re.compile(RVD_SEQ_REGEX, re.IGNORECASE | re.MULTILINE)
    if rvd_pattern.match(options.rvdString) is None:
        raise TaskError("RVD sequence is not in the correct format. Enter between 12 and 31 RVDs using the standard single letter amino acid abbreviations.")

    if options.genome and options.organism not in VALID_GENOME_ORGANISMS:
        raise TaskError("Invalid organism specified.")
    if options.promoterome and options.organism not in VALID_PROMOTEROME_ORGANISMS:
        raise TaskError("Invalid organism specified.")

    if options.genome or options.promoterome:
        return

    with open(options.fasta, 'r') as fasta_handle:
        check_fasta_pasta(fasta_handle)
# Downstream (plus-strand) base that mirrors each allowed upstream base.
_FINDTAL_DOWNSTREAM_BASE = {"T": "A", "C": "G"}


def _findtal_base_positions(sequence, base, start, end):
    """Return, in ascending order, every index of *base* in ``sequence[start:end]``."""
    positions = []
    position = sequence.find(base, start, end)
    while position != -1:
        positions.append(position)
        position = sequence.find(base, position + 1, end)
    return positions


def _findtal_strong_rvds(site_seq, rvd_table):
    """Translate *site_seq* into a space-separated RVD string.

    Returns ``(rvd_string, cg_count)`` where ``cg_count`` is the number of
    C/G bases seen, or ``(None, 0)`` if a base has no entry in *rvd_table*.
    """
    rvds = []
    cg_count = 0
    for base in site_seq:
        if base not in rvd_table:
            return None, 0
        if base == "C" or base == "G":
            cg_count += 1
        rvds.append(rvd_table[base])
    return " ".join(rvds), cg_count


def _findtal_pair_seen(site_entry_counts, tal1_seq, tal2_seq):
    """Tell whether either half-site is already recorded with either half-site."""
    for first in (tal1_seq, tal2_seq):
        partners = site_entry_counts.get(first)
        if partners is not None and (tal1_seq in partners or tal2_seq in partners):
            return True
    return False


def _findtal_cg_percent(cg_count, total_bases):
    """Return *cg_count* out of *total_bases* as a whole-number percentage."""
    # Scale before rounding: round(ratio, 2) * 100 can land just below the
    # integer (0.57 * 100 == 56.99999999999999), which int() then truncates.
    return int(round(100.0 * cg_count / total_bases))


def _findtal_sites_for_spacer(sequence, seq_id, cut_site, spacer_size, strand_min,
                              strand_max, u_bases, rvd_table, site_entry_counts,
                              stop_at_first):
    """Collect candidate binding-site pairs around *cut_site* for one spacer size.

    New pairs are also recorded in *site_entry_counts* (modified in place).
    When *stop_at_first* is true, only the first acceptable pair per upstream
    base is kept.
    """
    spacer_size_left = spacer_size // 2
    spacer_size_right = spacer_size - spacer_size_left

    # The cut site must leave room for a minimum-length array on both sides.
    if (cut_site < strand_min + spacer_size_left + 1
            or cut_site > len(sequence) - (strand_min + spacer_size_right) - 1):
        return []

    # Half-site coordinates use inclusive start, exclusive end.
    tal1_end = cut_site - spacer_size_left
    tal2_start = cut_site + spacer_size_right

    found = []
    for u_base in u_bases:
        d_base = _FINDTAL_DOWNSTREAM_BASE[u_base]

        u_positions = _findtal_base_positions(
            sequence,
            u_base,
            max(cut_site - (strand_max + spacer_size_left) - 1, 0),
            cut_site - (strand_min + spacer_size_left),
        )
        d_positions = _findtal_base_positions(
            sequence,
            d_base,
            cut_site + (strand_min + spacer_size_right),
            cut_site + (strand_max + spacer_size_right) + 1,
        )

        finished = False
        # Farthest upstream base first, then farthest downstream base first,
        # i.e. the longest half-sites are tried before shorter ones.
        for u_pos in u_positions:
            for d_pos in reversed(d_positions):
                tal1_start = u_pos + 1
                tal1_seq = sequence[tal1_start:tal1_end]
                tal2_end = d_pos
                tal2_seq = sequence[tal2_start:tal2_end]

                if _findtal_pair_seen(site_entry_counts, tal1_seq, tal2_seq):
                    continue

                tal1_rvd, tal1_cg = _findtal_strong_rvds(tal1_seq, rvd_table)
                if tal1_rvd is None:
                    continue
                tal2_rvd, tal2_cg = _findtal_strong_rvds(reverseComplement(tal2_seq), rvd_table)
                if tal2_rvd is None:
                    continue

                if stop_at_first:
                    finished = True

                binding_site = BindingSite(
                    seq_id=seq_id,
                    cutsite=cut_site,
                    seq1_start=tal1_start,
                    seq1_end=tal1_end,
                    seq1_seq=tal1_seq,
                    seq1_rvd=tal1_rvd,
                    spacer_start=tal1_end,
                    spacer_end=tal2_start,
                    spacer_seq=sequence[tal1_end:tal2_start],
                    seq2_start=tal2_start,
                    seq2_end=tal2_end,
                    seq2_seq=tal2_seq,
                    seq2_rvd=tal2_rvd,
                    upstream=u_base,
                    cg_percent=_findtal_cg_percent(
                        tal1_cg + tal2_cg, len(tal1_seq) + len(tal2_seq)
                    ),
                )
                findRESitesInSpacer(sequence, binding_site)

                partners = site_entry_counts.setdefault(binding_site.seq1_seq, {})
                partners.setdefault(binding_site.seq2_seq, []).append(binding_site)
                found.append(binding_site)

                if finished:
                    break
            if finished:
                break

    return found


def RunFindTALTask(options):
    """Scan the FASTA input for paired binding sites and write a TSV report.

    For every sequence in ``options.fasta`` each cut site (or only
    ``options.filterbase`` when ``options.filter == 1``) is combined with
    every spacer length in the configured range. With ``options.filter == 0``
    the candidates per spacer and per cut site are reduced to one with
    ``filterByTALSize``. Sites matching ``filterStreubel`` are dropped when
    ``options.streubel`` is set, and off-target counts are added through
    ``PairedTargetFinderCountTask`` when ``options.check_offtargets`` is set.

    The report goes to ``options.outpath`` or, when that is ``"NA"``, to
    ``options.outdir + options.job + options.outfile``.

    Raises:
        TaskError: if off-target counting is requested but ``tfcount_found``
            is false, or a downloaded NCBI record is longer than
            ``OFFTARGET_COUNTING_SIZE_LIMIT``.
    """
    logger = create_logger(options.logFilepath)
    logger("Beginning")

    uses_ncbi_offtargets = options.check_offtargets and options.offtargets_ncbi != "NA"

    if uses_ncbi_offtargets:
        logger(
            "Retrieving NCBI off-target sequence. This could take a while if this sequence hasn't been used recently and needs to be downloaded from NCBI."
        )

    with Conditional(
        uses_ncbi_offtargets, CachedEntrezFile(logger, options.offtargets_ncbi)
    ) as maybe_entrez_file:

        offtarget_seq_filename = ""

        if options.check_offtargets:
            if not tfcount_found:
                raise TaskError("Non off-target counting worker attempted to process off-target counting task.")

            if options.offtargets_ncbi != "NA":
                logger("Finished retrieving NCBI off-target sequence.")

                # Validate downloaded sequence
                check_fasta_pasta(maybe_entrez_file.file)

                for record in FastaIterator(maybe_entrez_file.file, alphabet=generic_dna):
                    if len(record.seq) > OFFTARGET_COUNTING_SIZE_LIMIT:
                        raise TaskError(
                            "Off-Target counting is only supported for NCBI records where all individual sequences are under %d megabases in size"
                            % (OFFTARGET_COUNTING_SIZE_LIMIT / 1000000)
                        )

            if options.offtargets_fasta != "NA":
                offtarget_seq_filename = options.offtargets_fasta
            elif options.offtargets_ncbi != "NA":
                offtarget_seq_filename = maybe_entrez_file.filepath
            elif options.genome:
                offtarget_seq_filename = GENOME_FILE % options.organism
            elif options.promoterome:
                offtarget_seq_filename = PROMOTEROME_FILE % options.organism
            else:
                offtarget_seq_filename = options.fasta

        strong_binding_RVDs = {"A": "NI", "C": "HD", "G": "NN", "T": "NG"}
        if options.gspec:
            strong_binding_RVDs["G"] = "NH"

        if options.outpath == "NA":
            output_filepath = options.outdir + options.job + options.outfile
        else:
            output_filepath = options.outpath

        strand_min = 15 if options.arraymin is None else options.arraymin
        strand_max = 20 if options.arraymax is None else options.arraymax
        spacer_min = 15 if options.min is None else options.min
        spacer_max = 30 if options.max is None else options.max

        u_bases = []
        if options.cupstream != 1:
            u_bases.append("T")
        if options.cupstream != 0:
            u_bases.append("C")

        stop_at_first = options.filter == 0

        # Both handles are context-managed so they are released even when
        # scanning or off-target counting raises.
        with open(options.fasta, "r") as seq_file, open(output_filepath, "w") as out:
            table_ignores = ["TAL1 length", "TAL2 length", "Spacer length"]
            out.write("table_ignores:" + ",".join(table_ignores) + "\n")

            out.write(
                "options_used:"
                + ", ".join(
                    [
                        "array_min = " + str(strand_min),
                        "array_max = " + str(strand_max),
                        "spacer_min = " + str(spacer_min),
                        "spacer_max = " + str(spacer_max),
                        "upstream_base = " + (" or ".join(u_bases)),
                    ]
                )
                + "\n"
            )

            offtarget_header = "\tOff-Target Counts" if options.check_offtargets else ""
            out.write(
                "Sequence Name\tCut Site\tTAL1 start\tTAL2 start\tTAL1 length\tTAL2 length\tSpacer length\tSpacer range\tTAL1 RVDs\tTAL2 RVDs\tPlus strand sequence\tUnique RE sites in spacer\t% RVDs HD or NN/NH"
                + offtarget_header
                + "\n"
            )

            binding_sites = []

            for gene in FastaIterator(seq_file, alphabet=generic_dna):
                sequence = str(gene.seq).upper()
                # Half-site pairs already reported for this sequence.
                site_entry_counts = {}

                if options.filter == 1:
                    if options.filterbase > len(sequence):
                        logger("Skipped %s as the provided cut site was greater than the sequence length" % (gene.id))
                        continue
                    cut_site_positions = [options.filterbase]
                else:
                    cut_site_positions = range(len(sequence))

                logger("Scanning %s for binding sites" % (gene.id))

                for cut_site in cut_site_positions:
                    cut_site_potential_sites = []

                    for spacer_size in range(spacer_min, spacer_max + 1):
                        spacer_potential_sites = _findtal_sites_for_spacer(
                            sequence,
                            gene.id,
                            cut_site,
                            spacer_size,
                            strand_min,
                            strand_max,
                            u_bases,
                            strong_binding_RVDs,
                            site_entry_counts,
                            stop_at_first,
                        )
                        if not spacer_potential_sites:
                            continue
                        if options.filter == 0:
                            cut_site_potential_sites.append(
                                reduce(filterByTALSize, spacer_potential_sites)
                            )
                        else:
                            cut_site_potential_sites.extend(spacer_potential_sites)

                    if cut_site_potential_sites:
                        if options.filter == 0:
                            binding_sites.append(
                                reduce(filterByTALSize, cut_site_potential_sites)
                            )
                        else:
                            binding_sites.extend(cut_site_potential_sites)

            if options.streubel:
                binding_sites[:] = list(ifilterfalse(filterStreubel, binding_sites))

            if options.check_offtargets and binding_sites:
                off_target_pairs = [
                    [binding_site.seq1_rvd, binding_site.seq2_rvd]
                    for binding_site in binding_sites
                ]

                off_target_counts = PairedTargetFinderCountTask(
                    offtarget_seq_filename,
                    options.logFilepath,
                    options.cupstream,
                    3.0,
                    spacer_min,
                    spacer_max,
                    off_target_pairs,
                )

                for index, binding_site in enumerate(binding_sites):
                    binding_site.offtarget_counts = off_target_counts[index]

            for binding_site in binding_sites:
                output_items = [
                    str(binding_site.seq_id),
                    str(binding_site.cutsite),
                    str(binding_site.seq1_start),
                    str(binding_site.seq2_end - 1),
                    str(binding_site.seq1_end - binding_site.seq1_start),
                    str(binding_site.seq2_end - binding_site.seq2_start),
                    str(binding_site.spacer_end - binding_site.spacer_start),
                    str(binding_site.spacer_start) + "-" + str(binding_site.spacer_end - 1),
                    binding_site.seq1_rvd,
                    binding_site.seq2_rvd,
                    binding_site.upstream
                    + " "
                    + binding_site.seq1_seq
                    + " "
                    + binding_site.spacer_seq.lower()
                    + " "
                    + binding_site.seq2_seq
                    + " "
                    + ("A" if binding_site.upstream == "T" else "G"),
                    binding_site.re_sites,
                    str(binding_site.cg_percent),
                ]

                if options.check_offtargets:
                    output_items.append(
                        " ".join(str(binding_site.offtarget_counts[x]) for x in range(5))
                    )

                out.write("\t".join(output_items) + "\n")

        logger("Finished")
def _singletal_passes_rules(options, sequence, first_index, second_index,
                            last_index, half_site, strand_letters):
    """Apply the optional design rules to one candidate site.

    *first_index*, *second_index* and *last_index* are the plus-strand
    positions of the site's first, second and last base as read along the
    targeted strand. *strand_letters* gives the plus-strand letters that stand
    for A, C, G and T on the targeted strand ("ACGT" for the plus strand,
    "TGCA" for the minus strand).
    """
    letter_a, _, letter_g, letter_t = strand_letters

    # ``== True`` is kept on purpose: a rule is only enforced when the option
    # compares equal to True, exactly as before.
    if options.t1 == True and sequence[first_index] == letter_t:
        return False  # T at position 1
    if options.a2 == True and sequence[second_index] == letter_a:
        return False  # A at position 2
    if options.tn == True and sequence[last_index] != letter_t:
        return False  # site must end in T
    if options.gn == True and sequence[last_index] == letter_g:
        return False  # site may not end in G
    if options.comp == True:
        site_length = float(len(half_site))
        for base, letter in zip("ACGT", strand_letters):
            fraction = half_site.count(letter) / site_length
            if not (percent_comp_range_bottom[base] <= fraction <= percent_comp_range_top[base]):
                return False
    return True


def RunFindSingleTALSiteTask(options):
    """Scan the FASTA input for single binding sites and write a TSV report.

    Every site length from ``arraymin`` to ``arraymax`` (15..30 when either
    is None) is tried at every position of every sequence in
    ``options.fasta``. A site is kept when it is preceded by an allowed
    upstream base, consists only of letters in ``DNA`` and passes the enabled
    rules (``t1``, ``a2``, ``tn``, ``gn``, ``comp``). With ``options.revcomp``
    the minus strand is scanned as well. Each site gets an RVD string built
    from the strong-binding table, and off-target counts are added through
    ``TargetFinderCountTask`` when ``options.check_offtargets`` is set.

    The report goes to ``options.outpath`` or, when that is ``'NA'``, to
    ``options.outdir + options.job + options.outfile``.

    Raises:
        TaskError: if off-target counting is requested but ``tfcount_found``
            is false, or a downloaded NCBI record is longer than
            ``OFFTARGET_COUNTING_SIZE_LIMIT``.
    """
    logger = create_logger(options.logFilepath)
    logger("Beginning")

    uses_ncbi_offtargets = options.check_offtargets and options.offtargets_ncbi != "NA"

    if uses_ncbi_offtargets:
        logger("Retrieving NCBI off-target sequence. This could take a while if this sequence hasn't been used recently and needs to be downloaded from NCBI.")

    with Conditional(uses_ncbi_offtargets, CachedEntrezFile(logger, options.offtargets_ncbi)) as maybe_entrez_file:

        offtarget_seq_filename = ""

        if options.check_offtargets:
            if not tfcount_found:
                raise TaskError("Non off-target counting worker attempted to process off-target counting task.")

            if options.offtargets_ncbi != "NA":
                logger("Finished retrieving NCBI off-target sequence.")

                # Validate downloaded sequence
                check_fasta_pasta(maybe_entrez_file.file)

                for record in FastaIterator(maybe_entrez_file.file, alphabet=generic_dna):
                    if len(record.seq) > OFFTARGET_COUNTING_SIZE_LIMIT:
                        raise TaskError("Off-Target counting is only supported for NCBI records where all individual sequences are under %d megabases in size" % (OFFTARGET_COUNTING_SIZE_LIMIT / 1000000))

            if options.offtargets_fasta != "NA":
                offtarget_seq_filename = options.offtargets_fasta
            elif options.offtargets_ncbi != "NA":
                offtarget_seq_filename = maybe_entrez_file.filepath
            elif options.genome:
                offtarget_seq_filename = GENOME_FILE % options.organism
            elif options.promoterome:
                offtarget_seq_filename = PROMOTEROME_FILE % options.organism
            else:
                offtarget_seq_filename = options.fasta

        strong_binding_RVDs = {
            'A': 'NI',
            'C': 'HD',
            'G': 'NN',
            'T': 'NG',
        }
        if options.gspec:
            strong_binding_RVDs['G'] = 'NH'

        # Read all records up front; the handle is closed even if parsing fails.
        with open(options.fasta, 'r') as seq_file:
            genes = list(FastaIterator(seq_file, alphabet=generic_dna))

        for gene in genes:
            gene.seq = gene.seq.upper()

        if options.arraymin is None or options.arraymax is None:
            half_site_size = range(15, 31)
        else:
            half_site_size = range(options.arraymin, options.arraymax + 1)

        # Allowed base at position -1, and its plus-strand mirror for sites
        # on the minus strand.
        u_bases = []
        if options.cupstream != 1:
            u_bases.append("T")
        if options.cupstream != 0:
            u_bases.append("C")
        minus_upstream_bases = [{"T": "A", "C": "G"}[base] for base in u_bases]

        dna_letters = set(DNA)

        # gene -> start index -> list of binding sites
        gene_binding_sites = {}

        for gene in genes:
            logger("Scanning %s for binding sites" % (gene.id))
            sequence = gene.seq
            sequence_length = len(sequence)

            for size1 in half_site_size:
                # Plus strand: the site starts at sindex, upstream base at sindex - 1.
                for sindex in range(1, sequence_length - size1):
                    upstream = sequence[sindex - 1]
                    if upstream not in u_bases:
                        continue
                    half_site1 = sequence[sindex:sindex + size1]
                    # Adding the site's letters to DNA must not grow the set.
                    if len(dna_letters | set(half_site1)) != 4:
                        continue
                    if not _singletal_passes_rules(options, sequence, sindex, sindex + 1, sindex + size1 - 1, half_site1, "ACGT"):
                        continue
                    binding_site = Binding_site(perfectTAL1='none', start1=sindex, seq1=half_site1, is_plus=True, upstream=upstream)
                    gene_binding_sites.setdefault(gene, {}).setdefault(sindex, []).append(binding_site)

                if options.revcomp == True:
                    # Minus strand: the site ends (on the plus strand) at
                    # sindex, upstream base at sindex + 1.
                    for sindex in range(size1 - 1, sequence_length - 1):
                        upstream = sequence[sindex + 1]
                        if upstream not in minus_upstream_bases:
                            continue
                        half_site1 = sequence[sindex - size1 + 1:sindex + 1]
                        if len(dna_letters | set(half_site1)) != 4:
                            continue
                        if not _singletal_passes_rules(options, sequence, sindex, sindex - 1, sindex - size1 + 1, half_site1, "TGCA"):
                            continue
                        binding_site = Binding_site(perfectTAL1='none', start1=sindex, seq1=half_site1, is_plus=False, upstream=upstream)
                        gene_binding_sites.setdefault(gene, {}).setdefault(sindex, []).append(binding_site)

        # Compute TALs for each gene, using "strong-binding" RVDs for each nucleotide
        logger('Designing best scoring perfect TALs for each potential site...')

        for sites_by_start in gene_binding_sites.values():
            for site_list in sites_by_start.values():
                for binding_site in site_list:
                    if binding_site.is_plus:
                        target = binding_site.seq1
                    else:
                        target = binding_site.seq1.reverse_complement()
                    binding_site.perfectTAL1 = ' '.join(strong_binding_RVDs[base] for base in str(target))

        if options.outpath == 'NA':
            filename = options.outdir + options.job + options.outfile
        else:
            filename = options.outpath

        binding_sites = []
        for gene in sorted(gene_binding_sites):
            for start_site in gene_binding_sites[gene]:
                for binding_site in gene_binding_sites[gene][start_site]:
                    binding_site.gene_id = gene.id
                    binding_sites.append(binding_site)

        if options.check_offtargets and binding_sites:
            off_target_seqs = [binding_site.perfectTAL1 for binding_site in binding_sites]

            off_target_counts = TargetFinderCountTask(offtarget_seq_filename, options.logFilepath, options.cupstream, 3.0, off_target_seqs)

            for index, binding_site in enumerate(binding_sites):
                binding_site.offtarget_count = off_target_counts[index]

        with open(filename, 'w') as out:
            table_ignores = []
            if not options.revcomp:
                table_ignores.append("Plus strand sequence")
            if table_ignores:
                out.write("table_ignores:" + ",".join(table_ignores) + "\n")

            out.write("options_used:" + ', '.join([
                "array_min = " + str(options.arraymin),
                "array_max = " + str(options.arraymax),
                "upstream_base = " + (" or ".join(u_bases)),
                ("No T at position 1" if options.t1 else ""),
                # The a2 rule inspects the second base of the site.
                ("No A at position 2" if options.a2 else ""),
                ("Sites must end in a T" if options.tn else ""),
                ("Sites may not end in G/NN" if options.gn else ""),
                ("Base composition rules enforced" if options.comp else ""),
                ("Search reverse complement" if options.revcomp else ""),
            ]) + "\n")

            offtarget_header = "\tOff-Target Counts" if options.check_offtargets else ""
            out.write('Sequence Name\tTAL start\tTAL length\tRVD sequence\tStrand\tTarget sequence\tPlus strand sequence' + offtarget_header + '\n')

            for binding_site in binding_sites:
                offtarget_string = ""
                if options.check_offtargets:
                    offtarget_string = "\t%d" % binding_site.offtarget_count

                if binding_site.is_plus:
                    strand = 'Plus'
                    target_text = binding_site.upstream + " " + str(binding_site.seq1)
                    plus_text = binding_site.upstream + " " + str(binding_site.seq1)
                else:
                    strand = 'Minus'
                    target_text = ("T" if binding_site.upstream == "A" else "C") + " " + str(binding_site.seq1.reverse_complement())
                    plus_text = str(binding_site.seq1) + " " + binding_site.upstream

                out.write(
                    binding_site.gene_id + '\t'
                    + str(binding_site.start1) + '\t'
                    + str(len(binding_site.seq1)) + '\t'
                    + binding_site.perfectTAL1 + '\t'
                    + strand + '\t'
                    + target_text + '\t'
                    + plus_text
                    + offtarget_string + '\n'
                )

        logger('Finished')