def calc_crispr(chrom, seq_set):
    """Print one tab-separated line per qualifying candidate on either strand.

    Every item produced by ``window(seq_set)`` is examined twice: as given
    ("+") and after mapping each element through ``dna_dict`` and reversing
    ("-").  A candidate qualifies when its ``[4:24]`` slice ends with "G".
    Each printed line holds: chrom, start, end, the ``[4:24]`` slice, the
    value returned by ``calc_score`` for the full candidate, and the strand.
    Start/end are the window index plus 4 and plus 24 for both strands.

    Nothing is returned; output goes to stdout.
    """
    for offset, forward in enumerate(window(seq_set)):
        start = offset + 4
        end = offset + 24

        forward_spacer = forward[4:24]
        if forward_spacer.endswith("G"):
            forward_score = calc_score(forward)
            columns = (chrom, start, end, forward_spacer, forward_score, "+")
            print('\t'.join(str(column) for column in columns))

        # presumably dna_dict maps each base to its complement, making this
        # the reverse complement -- verify against the dna_dict definition
        reverse = ''.join(dna_dict[base] for base in forward)[::-1]
        reverse_spacer = reverse[4:24]
        if reverse_spacer.endswith("G"):
            reverse_score = calc_score(reverse)
            columns = (chrom, start, end, reverse_spacer, reverse_score, "-")
            print('\t'.join(str(column) for column in columns))
def calc_crispr(chrom, seq_set):
    """Scan the windows of ``seq_set`` and print scored candidates for both strands.

    For each window (from ``window(seq_set)``) the window itself is reported
    with strand "+", and its element-wise ``dna_dict`` translation, reversed,
    is reported with strand "-".  A candidate is printed only if its
    ``[4:24]`` slice ends with "G".  Output columns (tab separated): chrom,
    window index + 4, window index + 24, the ``[4:24]`` slice, the
    ``calc_score`` result for the whole candidate, strand.

    Returns None; all results are written to stdout.
    """

    def report(candidate, index, strand):
        # Emit a line only when the 20-character slice ends in "G".
        protospacer = candidate[4:24]
        if not protospacer.endswith("G"):
            return
        score = calc_score(candidate)
        row = [chrom, index + 4, index + 24, protospacer, score, strand]
        print('\t'.join(map(str, row)))

    index = 0
    for sgrna in window(seq_set):
        report(sgrna, index, "+")
        # Built only after the "+" strand has been reported, so a lookup
        # failure in dna_dict cannot suppress the forward-strand line.
        flipped = ''.join([dna_dict[x] for x in sgrna])[::-1]
        report(flipped, index, "-")
        index += 1
def _bowtie_command(program, cpus, refgenome, reject_limit, largeIndex):
    """Assemble the Bowtie argument list for the off-target alignment run."""
    command = [program, '-a', "--suppress", "3,5,6,7", cpus]
    if reject_limit:
        # subprocess only accepts string arguments, so the limit is stringified.
        command.extend(['-k', str(reject_limit)])
    if largeIndex:
        command.append('--large-index')
    command.extend([refgenome, '-f', 'temp.fa', 'offtargets.fa'])
    return command


def _score_offtarget_set(readname, offtargets, spacers_by_readname, mmpos_re,
                         reject_limit, offtargetcutoff):
    """Set the off-target score of the spacer named ``readname``.

    Returns the spacer dict when its score reaches ``offtargetcutoff``,
    otherwise None (also None when no spacer carries that read name).
    """
    matching_spacer = spacers_by_readname.get(readname)
    if matching_spacer is None:
        # e.g. a blank or malformed line in the Bowtie output; nothing to score.
        return None

    mmlist = []
    badcount = 0
    for entry in offtargets:
        pos = mmpos_re.findall(entry.get('mmpositions', ''))
        if not pos:
            # No mismatch offsets: the alignment is a perfect match.  Bowtie
            # returns a match for the spacer itself in the genome, so one
            # perfect hit is expected.
            badcount += 1
        elif entry.get('strand') == '+':
            mmlist.append([int(w) for w in pos])
        elif entry.get('strand') == '-':
            # Offsets of minus-strand hits are mirrored across the 20-nt spacer.
            mmlist.append([19 - int(w) for w in pos])

    # NOTE(review): the original comment says *two* perfect matches should flag
    # a perfect off-target, but the threshold has always been "> 2" -- confirm
    # which one is intended.
    if badcount > 2:
        matching_spacer['offtargetscore'] = 0
    elif reject_limit and len(mmlist) > reject_limit:
        # Speed this up by rejecting things with over a certain set of matching off-targets
        matching_spacer['offtargetscore'] = 0
    else:
        matching_spacer['offtargetscore'] = sumofftargets(mmlist)

    if float(matching_spacer['offtargetscore']) >= float(offtargetcutoff):
        return matching_spacer
    return None


def _write_spacer_table(outfile, headerlist, rows):
    """Write the header and result rows to ``outfile`` as Excel-dialect CSV."""
    # newline='' is what the csv module requires of files it writes to.
    with open(outfile, 'w', newline='') as ofile:
        output = csv.writer(ofile, dialect='excel')
        output.writerow(headerlist)
        output.writerows(rows)


def find_spacers(target=None, outfile=None, refgenome=None, restriction_sites=[], largeIndex=False, cutoff=0,
                 offtargetcutoff=0, trim=False, logging=False, nuclease='Cas9', return_limit=9, reject=False):
    """Find, filter, off-target-score and export protospacers for a FASTA file.

    Steps: read the records of ``target``; collect candidate protospacers on
    both strands with a nuclease-specific regular expression; drop candidates
    containing "TTTT", a listed restriction site, a GC content <= 20 or >= 80,
    or an on-target score below ``cutoff``; write the survivors to ``temp.fa``;
    align them with Bowtie (results in ``offtargets.fa``); score each spacer's
    off-targets; and finally write a CSV table to ``outfile``.

    Parameters:
        target: path of the FASTA file to search.
        outfile: path of the CSV file to write.
        refgenome: Bowtie index passed on the Bowtie command line.
        restriction_sites: enzymes handed to ``RestrictionBatch``; spacers
            cut by any of them are discarded.  The argument is not modified.
        largeIndex: add ``--large-index`` to the Bowtie call.
        cutoff: minimum on-target score (coerced to float).
        offtargetcutoff: minimum off-target score for a spacer to be reported.
        trim: parse with ``m_FastaIterator`` instead of ``SeqIO.parse``.  The
            header for each entry must then be
            GENEID | TRANSCRIPTID | EXON RANK | CONSTITUTIVE EXON | 5' UTR END |
            3' UTR STOP | EXON START | EXON END
        logging: accepted for interface compatibility; not used here.
        nuclease: 'Cas9' or 'Cpf1'; selects the search pattern.
        return_limit: maximum spacers reported per gene, or 'all'.
        reject: if truthy, passed to Bowtie as ``-k`` and used as the maximum
            number of mismatched alignments before a spacer is scored 0.

    Returns:
        0 when no spacer survives the on-target filters or when the user
        declines to retry a failed write; None otherwise.

    Raises:
        ValueError: ``nuclease`` is neither 'Cas9' nor 'Cpf1'.
        IOError: the input could not be read (re-raised after being reported).
        subprocess.CalledProcessError: Bowtie exited with a non-zero status.
    """
    # This will find our 20N-NGG target sequence plus the -4->-3 and +1->+3 nucleotides for scoring
    if nuclease == 'Cas9':
        PAM = r'(?i)[ACGT]{25}[G]{2}[ACGT]{3}'
    elif nuclease == 'Cpf1':
        PAM = r'(?i)[T]{2,}[A-Z]{25}'
    else:
        raise ValueError("Unsupported nuclease {!r}; expected 'Cas9' or 'Cpf1'".format(nuclease))

    # Command-line values may arrive as strings; normalise them once up front.
    cutoff = float(cutoff)
    reject_limit = int(reject) if reject else 0

    # Plain 'r' already gives universal newlines; the 'U' flag is rejected by
    # Python 3.11 and later.
    with open(target, 'r') as infile:
        try:
            # Use our modified FastaIterator instead of, perhaps the more proper, SeqIO.parse() method
            # allows us to trim the UTR sequences off
            if trim:
                itemiter = m_FastaIterator(infile)
            else:
                itemiter = SeqIO.parse(infile, 'fasta')
            itemlist = list(itemiter)
        except IOError as e:
            print("I/O error({0}): {1}".format(e.errno, e.strerror))
            raise

    rsb = RestrictionBatch(list(restriction_sites or []))
    spacerlist = []
    seen = set()  # protospacers already kept, so duplicated sequences are reported once
    print("{} sequences to search for spacers.".format(len(itemlist)))
    widgets = ['Examining sequence: ', progressbar.Counter(), ' ', progressbar.Percentage(), ' ',
               progressbar.Bar(), progressbar.Timer()]
    progress = progressbar.ProgressBar(widgets=widgets, maxval=len(itemlist)).start()
    spacer_re = regex.compile(PAM)

    for item_number, item in enumerate(itemlist, 1):
        progress.update(item_number)
        forward_seq = str(item.seq)
        # Find all of the potential protospacer sequences on both strands
        spacerMatch = (spacer_re.findall(forward_seq, overlapped=True) +
                       spacer_re.findall(str(item.reverse_complement().seq), overlapped=True))
        for ps in spacerMatch:
            # Note that ps[4:24] is the actual protospacer.  The rest of the
            # match is only needed for scoring.
            protospacer = ps[4:24]
            if protospacer in seen:
                continue
            # Get rid of anything with T(4+) as those act as RNAPIII terminators
            # TODO Should this also eliminate anything with G(4)?
            if "TTTT" in protospacer:
                continue
            ps_seq = Seq(protospacer, IUPAC.unambiguous_dna)
            # Get rid of anything that has the verboten restriction sites
            if any(hits != [] for hits in rsb.search(ps_seq).values()):
                continue
            # Eliminate potentials with a GC content <20 or >80%
            gc_percent = GC(ps_seq)
            if gc_percent <= 20 or gc_percent >= 80:
                continue
            # on_target_score_calculator only works if the sequence is in uppercase
            score = calc_score(ps.upper())
            if float(score) < cutoff:
                continue
            # NOTE(review): matches that exist only on the reverse complement are
            # not found in the forward string, so find() yields -1 for them --
            # confirm the intended minus-strand coordinate before relying on it.
            position = int(forward_seq.find(ps)) + int(item.description.split("|")[7])
            spacerlist.append({'description': item.description,
                               'position': int(position),
                               'score': score,
                               'spacer': protospacer,
                               'offtargetscore': 100})
            seen.add(protospacer)
    progress.finish()

    if not spacerlist:
        print("Sorry, no spacers matching that criteria were found")
        return 0

    print("Finished finding spacers. {} spacers found. Begining Bowtie alignment...".format(len(spacerlist)))
    # write out a file to pass off to Bowtie to use for off-target analysis
    with open('temp.fa', 'w') as fasta_out:
        for entry in spacerlist:
            fasta_out.write(">%s %s %s\n%s\n" % (entry['description'], entry['position'],
                                                 entry['score'], entry['spacer']))
    # delete collections we are not going to use in the future to save on some memory
    del itemlist
    del seen

    # Use Bowtie to find if this particular sequence has any potential off targets.
    # TODO switch to Bowtie2 so we can account for gaps in mismatches
    # TODO can we modify this setup to account for NAG PAMs or do we even need to?
    program = 'bowtie'
    cpus = "-p" + str(cpu_count())
    subprocess.check_output(_bowtie_command(program, cpus, refgenome, reject_limit, largeIndex))
    print("Bowtie finished. Begining offtarget analysis...")

    bowtie_results_file = 'offtargets.fa'
    # This count is slow and probably unnecessary, but the progress bar needs a maximum
    with open(bowtie_results_file) as counting_handle:
        total_lines = sum(1 for _ in counting_handle)
    print("Total alignments from Bowtie: {}".format(total_lines))
    new_widgets = ['Scoring for off-targets. Examining: ', progressbar.Counter(), ' ', progressbar.Percentage(), ' ',
                   progressbar.Bar(), progressbar.Timer(), progressbar.ETA()]
    new_progress = progressbar.ProgressBar(widgets=new_widgets, maxval=total_lines).start()

    # Read names are the FASTA headers written to temp.fa above; index the
    # spacers by that name once (first occurrence wins) instead of scanning
    # the whole list for every group.
    spacers_by_readname = {}
    for spacer in spacerlist:
        readname = '{} {} {}'.format(spacer['description'], spacer['position'], str(spacer['score']))
        spacers_by_readname.setdefault(readname, spacer)

    prunedlist = []  # List to hold spacers whose off-target score is above the minimum cutoff
    mmpos_re = regex.compile('[0-9]{1,}')
    keys = ['readname', 'strand', 'position', 'mmpositions']
    current_readname = None
    current_set_of_offtargets = []

    def flush_current_set():
        # Score the alignments gathered for one spacer and keep it if it passes.
        kept = _score_offtarget_set(current_readname, current_set_of_offtargets, spacers_by_readname,
                                    mmpos_re, reject_limit, offtargetcutoff)
        if kept is not None:
            prunedlist.append(kept)

    with open(bowtie_results_file) as offtargetsfile:
        # Consecutive lines with the same read name belong to the same spacer.
        for oftcount, line in enumerate(offtargetsfile, 1):
            new_progress.update(oftcount)
            next_entry = dict(zip(keys, line.strip('\n').split('\t')))
            if current_set_of_offtargets and next_entry['readname'] != current_readname:
                flush_current_set()
                current_set_of_offtargets = []
            current_readname = next_entry['readname']
            current_set_of_offtargets.append(next_entry)
    if current_set_of_offtargets:
        # The last group has no following line to trigger its scoring.
        flush_current_set()
    new_progress.finish()
    print("\nFinished scoring off-targets")

    if not prunedlist:
        print("No spacers were found that correspond to the parameters you set.")
    else:
        if len(prunedlist[0]['description'].split('|')) == 9:
            # need to make sure the header format is correct
            finallist = [FormattedResult(entry) for entry in prunedlist]
            # Group by GeneID, keeping genes in the order they were first seen
            spacers_by_gene = {}
            for result in finallist:
                spacers_by_gene.setdefault(result.GeneID, []).append(result)
            toplist = []
            for all_spacers_for_gene in spacers_by_gene.values():
                ranked_spacers = sorted(all_spacers_for_gene, key=attrgetter('score', 'offtargetscore'),
                                        reverse=True)
                if return_limit != 'all':
                    # Slicing keeps every spacer when fewer than the limit exist.
                    ranked_spacers = ranked_spacers[:int(return_limit)]
                toplist.extend(ranked_spacers)
            olist = [[entry.GeneID, entry.GeneName, entry.seq, entry.GC, entry.position, entry.score,
                      entry.offtargetscore] for entry in toplist]
            headerlist = ['GeneID', 'GeneName', 'seq', '%GC', 'position', 'score', 'off-target score']
        else:
            # if it isn't, just dump all the spacers we found into a file
            olist = [[entry['description'].split(' ')[0], entry['spacer'], GC(entry['spacer']),
                      entry['position'], entry['score']] for entry in spacerlist]
            headerlist = ['ID', 'seq', '%GC', 'position', 'score']

        print("Writing file.")
        try:
            _write_spacer_table(outfile, headerlist, olist)
        except IOError:
            print("There is trouble writing to the file.  Perhaps it is open in another application?")
            choice = input("Would you like to try again? [y/n]")
            if choice == 'y' or choice == 'Y':
                try:
                    _write_spacer_table(outfile, headerlist, olist)
                except IOError:
                    print("Sorry, still was unable to write to the file")
            else:
                return 0
    print("Finished.")