Пример #1
0
def calc_crispr(chrom, seq_set):
    """Print one tab-separated line per candidate guide found in ``seq_set``.

    Every window yielded by ``window(seq_set)`` is examined twice: first as
    given (strand "+"), then after mapping each character through
    ``dna_dict`` and reversing the result (strand "-"; presumably the reverse
    complement -- confirm against ``dna_dict``).  Whenever characters 4..23 of
    the examined sequence end in "G", the full sequence is scored with
    ``calc_score`` and a line with the fields

        chrom, offset + 4, offset + 24, 20-character slice, score, strand

    is printed, where ``offset`` is the zero-based index of the window.
    Both strands report the same offset-based coordinates.
    """
    for offset, forward in enumerate(window(seq_set)):
        for strand in ("+", "-"):
            if strand == "+":
                candidate = forward
            else:
                # Built only after the "+" strand has been handled, so a
                # lookup failure in dna_dict cannot suppress the "+" line.
                candidate = ''.join(dna_dict[base] for base in forward)[::-1]

            guide = candidate[4:24]
            if not guide.endswith("G"):
                continue

            fields = [chrom, offset + 4, offset + 24, guide,
                      calc_score(candidate), strand]
            print('\t'.join(str(field) for field in fields))
Пример #2
0
def calc_crispr(chrom, seq_set):
    """Scan ``seq_set`` window by window and print scored guide candidates.

    For the window at zero-based index ``start`` two sequences are tested:
    the window itself ("+") and the window translated character by character
    through ``dna_dict`` and then reversed ("-").  A sequence qualifies when
    its slice ``[4:24]`` ends with "G"; it is then scored by ``calc_score``
    and reported on stdout as a tab-separated record:

        chrom <TAB> start+4 <TAB> start+24 <TAB> slice <TAB> score <TAB> strand
    """
    def report(sequence, start, strand):
        # Emit the record for one strand if the 20-character slice qualifies.
        guide = sequence[4:24]
        if guide.endswith("G"):
            score = calc_score(sequence)
            record = (chrom, start + 4, start + 24, guide, score, strand)
            print('\t'.join(map(str, record)))

    start = 0
    for plus_strand in window(seq_set):
        report(plus_strand, start, "+")
        # The minus strand is derived only after the plus strand was reported.
        minus_strand = ''.join([dna_dict[base] for base in plus_strand])[::-1]
        report(minus_strand, start, "-")
        start += 1
Пример #3
0
def _write_spacer_csv(outfile, headerlist, rows):
    """Write ``headerlist`` followed by ``rows`` to ``outfile`` as Excel-dialect CSV."""
    with open(outfile, 'w') as ofile:
        output = csv.writer(ofile, dialect='excel')
        output.writerows([headerlist])
        output.writerows(rows)


def _score_offtarget_group(alignments, mmpos_re, reject_limit):
    """Return the off-target score for all Bowtie alignments of one spacer.

    ``alignments`` is a list of dicts with the keys 'strand' and
    'mmpositions'.  Alignments without any mismatch position are counted as
    perfect matches; for the others the mismatch offsets are collected
    (mirrored as ``19 - offset`` on the '-' strand) and handed to
    ``sumofftargets``.  The score is 0 when more than two perfect matches
    exist, or when ``reject_limit`` is set and exceeded by the number of
    mismatched alignments.
    """
    mmlist = []
    badcount = 0
    for entry in alignments:
        pos = mmpos_re.findall(entry['mmpositions'])
        if len(pos) == 0:
            # Bowtie also reports the spacer's own location in the genome.
            badcount += 1
        elif entry['strand'] == '+':
            mmlist.append([int(w) for w in pos])
        elif entry['strand'] == '-':
            mmlist.append([19 - int(w) for w in pos])

    # NOTE(review): the original comment speaks of "two such matches" but the
    # threshold has always been "> 2"; kept as-is -- confirm which is intended.
    if badcount > 2:
        return 0
    if reject_limit is not None and len(mmlist) > reject_limit:
        return 0
    return sumofftargets(mmlist)


def find_spacers(target=None, outfile=None, refgenome=None, restriction_sites=None, largeIndex=False, cutoff=0,
                 offtargetcutoff=0, trim=False, logging=False, nuclease='Cas9', return_limit=9, reject=False):
    """Find, filter and score candidate spacers in a FASTA file and write them to a CSV file.

    Steps performed:
      1. Read all records from ``target``.
      2. Search both strands of every record with a nuclease-specific regex
         and drop candidates that contain "TTTT", hit one of the
         ``restriction_sites``, have a GC value <= 20 or >= 80, score below
         ``cutoff`` or were already seen.
      3. Write the survivors to ``temp.fa`` and run ``bowtie`` on them,
         producing ``offtargets.fa`` (both in the current directory).
      4. Assign an off-target score to every spacer from the Bowtie
         alignments and keep those at or above ``offtargetcutoff``.
      5. Write the result to ``outfile`` as CSV.

    Args:
        target: Path of the FASTA file to search.
        outfile: Path of the CSV file to write.
        refgenome: Index argument handed to ``bowtie``.
        restriction_sites: Enzymes passed to ``RestrictionBatch``; ``None``
            (the default) means no restriction-site filtering.
        largeIndex: If true, ``--large-index`` is passed to ``bowtie``.
        cutoff: Minimum on-target score (converted with ``float``).
        offtargetcutoff: Minimum off-target score (converted with ``float``).
        trim: If true, records are parsed with ``m_FastaIterator`` instead of
            ``SeqIO.parse``.
        logging: Not used by this function.
        nuclease: ``'Cas9'`` or ``'Cpf1'``; selects the search pattern.
        return_limit: ``'all'`` or the maximum number of spacers kept per
            GeneID.
        reject: If truthy, an integer limit passed to ``bowtie -k``; spacers
            with more mismatched alignments than this get an off-target
            score of 0.

    Returns:
        0 when no candidate spacer was found or when the user declines to
        retry a failed write; ``None`` otherwise.

    Raises:
        ValueError: If ``nuclease`` is not one of the supported values.
        IOError: If the input file cannot be opened or parsed.
    """
    # This will find our 20N-NGG target sequence plus the -4->-3 and +1->+3 nucleotides for scoring
    if nuclease == 'Cas9':
        PAM = r'(?i)[ACGT]{25}[G]{2}[ACGT]{3}'
    elif nuclease == 'Cpf1':
        PAM = r'(?i)[T]{2,}[A-Z]{25}'
    else:
        # Previously an unknown nuclease surfaced later as a NameError on PAM.
        raise ValueError("Unsupported nuclease: {!r} (expected 'Cas9' or 'Cpf1')".format(nuclease))

    if restriction_sites is None:
        restriction_sites = []

    # An integer limit, or None when rejection is disabled.
    reject_limit = int(reject) if reject else None

    # Plain 'r' already gives universal newlines on Python 3; the 'U' flag was removed in 3.11.
    with open(target, 'r') as infile:
        # Use our modified FastaIterator instead of, perhaps the more proper, SeqIO.parse() method
        # allows us to trim the UTR sequences off
        try:
            if trim:
                # If the 'trim' option is enabled, the header for each entry must be
                # GENEID | TRANSCRIPTID | EXON RANK | CONSTITUTIVE EXON | 5' UTR END | 3' UTR STOP | EXON START | EXON END
                itemiter = m_FastaIterator(infile)
            else:
                itemiter = SeqIO.parse(infile, 'fasta')
        except IOError as e:
            print("I/O error({0}): {1}".format(e.errno, e.strerror))
            # Re-raise: continuing would only fail later on the unbound iterator.
            raise
        itemlist = list(itemiter)

    spacerlist = []
    rsb = RestrictionBatch(restriction_sites)

    # Spacer sequences already accepted; a set keeps the membership test cheap.
    seen = set()
    print("{} sequences to search for spacers.".format(len(itemlist)))
    widgets = ['Examining sequence: ', progressbar.Counter(), ' ', progressbar.Percentage(), ' ',
               progressbar.Bar(), progressbar.Timer()]
    progress = progressbar.ProgressBar(widgets=widgets, maxval=len(itemlist)).start()

    spacer_re = regex.compile(PAM)

    for count, item in enumerate(itemlist, 1):
        progress.update(count)

        # Find all potential protospacer contexts on both strands.
        spacerMatch = (spacer_re.findall(str(item.seq), overlapped=True) +
                       spacer_re.findall(str(item.reverse_complement().seq), overlapped=True))

        for ps in spacerMatch:
            # Note that ps[4:24] is the actual protospacer.  I need the rest of the sequence for scoring
            spacer_seq = ps[4:24]
            ps_seq = Seq(spacer_seq, IUPAC.unambiguous_dna)
            rs = rsb.search(ps_seq)
            # on_target_score_calculator only works if the sequence is in uppercase
            # otherwise, it returns a value of 0.193313360829 for some reason
            score = calc_score(ps.upper())

            # Get rid of anything with T(4+) as those act as RNAPIII terminators
            # TODO Should this also eliminate anything with G(4)?
            if "TTTT" in spacer_seq:
                continue
            # Get rid of anything that has the verboten restriction sites
            if any(hits != [] for hits in rs.values()):
                continue
            # Eliminate potentials with a GC value <= 20 or >= 80
            gc_value = GC(ps_seq)
            if gc_value <= 20 or gc_value >= 80:
                continue
            # Both sides are converted so a cutoff given as a string (e.g. from
            # the command line) compares numerically.
            if float(score) < float(cutoff):
                continue
            # Duplicated sequences would otherwise give duplicate entries.
            if spacer_seq in seen:
                continue

            # NOTE(review): find() is run on the forward sequence only, so for
            # matches taken from the reverse complement it normally returns -1.
            position = int(str(item.seq).find(ps)) + int(item.description.split("|")[7])
            spacerlist.append({
                'description': item.description,
                'position': int(position),
                'score': score,
                'spacer': spacer_seq,
                'offtargetscore': 100,
            })
            seen.add(spacer_seq)
    progress.finish()

    if len(spacerlist) == 0:
        print("Sorry, no spacers matching that criteria were found")
        return 0
    print("Finished finding spacers.  {} spacers found.  Begining Bowtie alignment...".format(len(spacerlist)))

    # write out a file to pass off to Bowtie to use for off-target analysis
    with open('temp.fa', 'w') as fasta_out:
        for entry in spacerlist:
            fasta_out.write(">%s %s %s\n%s\n" % (entry['description'], entry['position'],
                                                 entry['score'], entry['spacer']))

    # delete lists we are not going to use in the future to save on some memory
    del itemlist
    del seen

    # Use Bowtie to find if this particular sequence has any potential off targets (i. e. two or fewer mismatches)
    # As current, Bowtie is set to return all everything that a particular spacer matches within the reference genome
    # with two or fewer mismatches.
    # TODO switch to Bowtie2 so we can account for gaps in mismatches
    # TODO can we modify this setup to account for NAG PAMs or do we even need to? (i. e. are we already?)
    program = 'bowtie'
    cpus = "-p" + str(cpu_count())

    bowtie_cmd = [program, '-a', "--suppress", "3,5,6,7", cpus]
    if reject_limit is not None:
        # subprocess requires every argument to be a string.
        bowtie_cmd += ['-k', str(reject_limit)]
    if largeIndex:
        bowtie_cmd.append('--large-index')
    bowtie_cmd += [refgenome, '-f', 'temp.fa', 'offtargets.fa']
    subprocess.check_output(bowtie_cmd)

    print("Bowtie finished.  Begining offtarget analysis...")
    bowtie_results_file = 'offtargets.fa'

    # This count is slow and probably unnecessary
    with open(bowtie_results_file) as counted:
        total_lines = sum(1 for _ in counted)
    print("Total alignments from Bowtie: {}".format(total_lines))
    new_widgets = ['Scoring for off-targets. Examining: ', progressbar.Counter(), ' ', progressbar.Percentage(),
                   ' ', progressbar.Bar(), progressbar.Timer(), progressbar.ETA()]
    new_progress = progressbar.ProgressBar(widgets=new_widgets, maxval=total_lines).start()

    prunedlist = []  # List to hold spacers whose off-target score is above the minimum cutoff

    mmpos_re = regex.compile('[0-9]{1,}')

    # Map each Bowtie read name back to its spacer.  The name mirrors the
    # header written to temp.fa; setdefault keeps the first spacer per name.
    spacer_by_readname = {}
    for spacer in spacerlist:
        readname = '{} {} {}'.format(spacer['description'], spacer['position'], str(spacer['score']))
        spacer_by_readname.setdefault(readname, spacer)

    def score_group(group):
        # Score one spacer's alignments and keep the spacer if it passes the cutoff.
        matching_spacer = spacer_by_readname.get(group[0]['readname'])
        if matching_spacer is None:
            print("Warning: no spacer matches Bowtie read name '{}'; skipping.".format(group[0]['readname']))
            return
        matching_spacer['offtargetscore'] = _score_offtarget_group(group, mmpos_re, reject_limit)
        if float(matching_spacer['offtargetscore']) >= float(offtargetcutoff):
            prunedlist.append(matching_spacer)

    keys = ['readname', 'strand', 'position', 'mmpositions']
    with open(bowtie_results_file) as offtargetsfile:  # parse all the bowtie results into something we can use
        current_set_of_offtargets = []
        for line_number, line in enumerate(offtargetsfile, 1):
            new_progress.update(line_number)
            if not line.strip():
                continue
            entry = dict(zip(keys, line.strip('\n').split('\t')))
            # A different read name means the previous spacer's group is complete.
            if current_set_of_offtargets and entry['readname'] != current_set_of_offtargets[0]['readname']:
                score_group(current_set_of_offtargets)
                current_set_of_offtargets = []
            current_set_of_offtargets.append(entry)
        # The final group has no following line to trigger its scoring.
        if current_set_of_offtargets:
            score_group(current_set_of_offtargets)

    new_progress.finish()

    print("\nFinished scoring off-targets")

    if len(prunedlist) == 0:
        print("No spacers were found that correspond to the parameters you set.")
        return None

    if len(prunedlist[0]['description'].split('|')) == 9:  # need to make sure the header format is correct
        finallist = [FormattedResult(entry) for entry in prunedlist]
        # Create a set of all the GeneIDs in our list
        geneset = {y.GeneID for y in finallist}
        toplist = []
        for z in geneset:
            all_spacers_for_gene = [a for a in finallist if a.GeneID == z]
            ranked_spacers = sorted(all_spacers_for_gene, key=attrgetter('score', 'offtargetscore'), reverse=True)
            if return_limit == 'all':
                toplist.extend(ranked_spacers)
            else:
                # Slicing keeps every spacer when a gene has fewer than the
                # limit (the old index loop dropped the last one).
                toplist.extend(ranked_spacers[:max(int(return_limit), 0)])
        olist = [[entry.GeneID, entry.GeneName, entry.seq, entry.GC, entry.position,
                  entry.score, entry.offtargetscore] for entry in toplist]
        headerlist = ['GeneID', 'GeneName', 'seq', '%GC', 'position', 'score', 'off-target score']
    else:  # if it isn't, just dump all the spacers we found into a file
        # spacerlist holds plain dicts, so the fields are read by key.
        olist = [[entry['description'].split(' ')[0], entry['spacer'], GC(entry['spacer']),
                  entry['position'], entry['score']] for entry in spacerlist]
        headerlist = ['ID', 'seq', '%GC', 'position', 'score']

    print("Writing file.")
    try:
        _write_spacer_csv(outfile, headerlist, olist)
    except IOError:
        print("There is trouble writing to the file.  Perhaps it is open in another application?")
        choice = input("Would you like to try again? [y/n]")
        if choice == 'y' or choice == 'Y':
            try:
                _write_spacer_csv(outfile, headerlist, olist)
            except IOError:
                print("Sorry, still was unable to write to the file")
        else:
            return 0

    print("Finished.")