def fetch_restriction_sites(self, enzymes="Common"):
    """Spike in the target variant first, then generate the list of
    restriction enzymes that distinguish it from the reference."""
    if enzymes == "ALL":
        enzyme_group = AllEnzymes
    elif enzymes == "Common":
        enzyme_group = CommOnly
    elif enzymes == "HF":
        enzyme_group = high_fidelity
    else:
        enzyme_group = RestrictionBatch(enzymes.split(","))
    # Filter out enzymes with ambiguous recognition sites
    enzyme_group = RestrictionBatch(
        [x for x in enzyme_group if not x.is_ambiguous()])
    # Calculate RFLPs for ALT sites only
    self.ref_sites = dict(enzyme_group.search(self.ref_seq))
    self.primary_variant_sites = dict(
        enzyme_group.search(self.primary_variant_seq))
    self.rflps = {
        k: (self.ref_sites[k], self.primary_variant_sites[k])
        for k, v in self.ref_sites.items()
        if 0 < len(v) <= 3
        and self.ref_sites[k] != self.primary_variant_sites[k]
    }
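# A minimal sketch of the RFLP check above, outside its class (hedged: the
# sequences and enzyme choice here are illustrative assumptions, not taken
# from the original project). An enzyme is diagnostic when its cut positions
# differ between the reference and the variant sequence.
from Bio.Seq import Seq
from Bio.Restriction import RestrictionBatch

ref = Seq("AAAGAATTCAAA")   # contains an EcoRI site (GAATTC)
alt = Seq("AAAGAATACAAA")   # the variant base destroys the EcoRI site
batch = RestrictionBatch(["EcoRI"])
ref_sites = batch.search(ref)
alt_sites = batch.search(alt)
rflps = {e: (ref_sites[e], alt_sites[e])
         for e in ref_sites if ref_sites[e] != alt_sites[e]}
print(rflps)  # {EcoRI: ([5], [])} -> EcoRI distinguishes ref from alt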
def digest_genome(genome_fp, restriction_enzyme, output_dir, linear=False):
    base_fp = os.path.basename(genome_fp)
    if '.' in base_fp:
        base_fp = '{}.{}.fragments.bed'.format(
            base_fp[:base_fp.rfind('.')], restriction_enzyme)
    else:
        base_fp = '{}.{}.fragments.bed'.format(base_fp, restriction_enzyme)
    base_fp = os.path.join(output_dir, base_fp)
    if os.path.isfile(base_fp):
        overwrite = input(
            'WARNING: fragment BED {} already exists. Overwrite? [y/N] '.format(base_fp))
        if overwrite.lower() != 'y':
            print("Did not overwrite existing fragment BED.")
            return
        os.remove(base_fp)
    print("Digesting")
    # Guess the input format from the file extension
    if genome_fp.endswith(('.fasta', '.fa', '.fna')):
        genome = SeqIO.parse(genome_fp, format='fasta')
    else:
        genome = SeqIO.parse(genome_fp, format='genbank')
    batch = RestrictionBatch([restriction_enzyme])
    for chromosome in genome:
        print('{}\t{}'.format(chromosome.id, len(chromosome.seq)))
        # Digest the sequence data and collect the cut points
        for enzyme, cutpoints in batch.search(chromosome.seq, linear=linear).items():
            if len(cutpoints) == 0:
                print('No restriction sites found for {}'.format(chromosome.id))
                continue
            df = pd.DataFrame(cutpoints, columns=['cutpoint'])
            df['end'] = df.cutpoint - 1
            df['start'] = df.end - df.cutpoint.diff()
            df.loc[0, 'start'] = 0
            df['start'] = df['start'].astype('Int64')
            if len(df) > 1:
                last_fragment = pd.DataFrame({
                    'start': [df.loc[len(df) - 1, 'end']],
                    'end': [len(chromosome.seq)],
                    'cutpoint': [-1]
                })
                df = pd.concat([df, last_fragment], ignore_index=True)
            else:
                df.loc[len(df) - 1, 'end'] = len(chromosome.seq)
            df['frag_id'] = df.index
            # Normalize the chromosome name to a 'chr' prefix
            accession = chromosome.id
            if "." in chromosome.id:
                accession, _version = chromosome.id.split(".", 1)
            if not accession.startswith("chr"):
                accession = "chr" + accession
            df['chr'] = accession
            df[['chr', 'start', 'end', 'frag_id']].to_csv(
                base_fp, index=False, sep='\t', mode='a', header=False)
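# A minimal check of the cutpoint -> BED interval logic above (hedged:
# standalone sketch, not part of the original module). Biopython reports
# 1-based cut positions; each fragment becomes a half-open [start, end) row.
import pandas as pd

cutpoints = [5, 12, 20]          # 1-based cut positions from search()
df = pd.DataFrame(cutpoints, columns=['cutpoint'])
df['end'] = df.cutpoint - 1      # 0-based end of the fragment upstream of each cut
df['start'] = df.end - df.cutpoint.diff()
df.loc[0, 'start'] = 0
df['start'] = df['start'].astype('Int64')
print(df[['start', 'end']])
#    start  end
# 0      0    4
# 1      4   11
# 2     11   19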
def REsearch(goi='', goiFile='', mcs='', mcsFile=''):
    # Enzymes available from the major commercial suppliers
    rb = RestrictionBatch(suppliers=[
        'C', 'B', 'E', 'I', 'K', 'J', 'M', 'O', 'N', 'Q', 'S', 'R',
        'V', 'Y', 'X'
    ])
    goi = Seq(goi, IUPACUnambiguousDNA()) if goi else read_seq(goiFile)
    if not goi:
        raise ValueError('Please provide a GOI sequence!')
    mcs = Seq(mcs, IUPACUnambiguousDNA()) if mcs else read_seq(mcsFile)
    if not mcs:
        raise ValueError('Please provide an MCS sequence!')
    result_mcs = rb.search(mcs)
    result_goi = rb.search(goi)
    # Enzymes that cut the MCS but not the gene of interest
    REs = ({e for e in result_mcs if result_mcs[e]}
           - {e for e in result_goi if result_goi[e]})
    r = []
    for e in REs:
        for site in result_mcs[e]:
            r.append((str(e), site,
                      "blunt" if e.is_blunt() else e.elucidate(),
                      ' '.join(e.suppl)))
    r.sort(key=lambda i: i[1])
    return r
def has_restriction_site(seq):
    from Bio.Seq import Seq
    from Bio.Restriction import RestrictionBatch

    # `restriction_sites` is a module-level list of enzyme names
    mix = RestrictionBatch(restriction_sites)
    hits = mix.search(Seq(seq))
    return any(hits.values())
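# Usage sketch for has_restriction_site (hedged: `restriction_sites` is the
# module-level list of enzyme names that the function reads; the names here
# are arbitrary examples).
restriction_sites = ["EcoRI", "HindIII"]
print(has_restriction_site("AAAGAATTCAAA"))  # True: contains an EcoRI site
print(has_restriction_site("AAAAAAAAAAAA"))  # False: no sites for either enzyme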
def test_batch_analysis(self):
    """Sequence analysis with a restriction batch."""
    seq = Seq("AAAA" + EcoRV.site + "AAAA" + EcoRI.site + "AAAA")
    batch = RestrictionBatch([EcoRV, EcoRI])
    hits = batch.search(seq)
    self.assertEqual(hits[EcoRV], [8])
    self.assertEqual(hits[EcoRI], [16])
def test_batch_analysis(self):
    """Sequence analysis with a restriction batch (older Biopython
    variant, with an explicit alphabet)."""
    seq = Seq("AAAA" + EcoRV.site + "AAAA" + EcoRI.site + "AAAA",
              IUPACAmbiguousDNA())
    batch = RestrictionBatch([EcoRV, EcoRI])
    hits = batch.search(seq)
    self.assertEqual(hits[EcoRV], [8])
    self.assertEqual(hits[EcoRI], [16])
def re_sites(self, sequence):
    seq = Seq(sequence, IUPACAmbiguousDNA())  # note: an instance, not the class
    # Set up analysis with our enzymes and sequence
    rb = RestrictionBatch(self.enzyme_set)
    # Digest and reformat to a dict of {site: enzyme}; if two enzymes cut at
    # the same position, the later one overwrites the earlier.
    re_sites = {}
    for enzyme, cutsites in rb.search(seq).items():
        for cut in cutsites:
            # Adjust the reported position by the enzyme's 3' cut offset
            cut = cut + enzyme.fst3 - 1
            re_sites[cut] = enzyme
    return sorted(re_sites.items())
def restriction_sites_present(spacer: str, rsb: RestrictionBatch) -> List[int]:
    """Determine if and where a set of restriction sites are present in a sequence\f

    Parameters
    ----------
    spacer : `str`
        Spacer sequence to examine for restriction sites.
    rsb : :class:`RestrictionBatch`
        Batch of restriction enzymes to search with.

    Returns
    -------
    :class:`typing.List`[`int`]
        Positions of any restriction sites found.
    """
    # Flatten the per-enzyme hit lists into one list of positions. (The
    # original collapsed this to a bool, contradicting the List[int] return
    # type and the "where" in the docstring.)
    sites = [pos for results in rsb.search(Seq(spacer)).values() for pos in results]
    return sorted(sites)
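# Usage sketch for restriction_sites_present (hedged: standalone example;
# positions are Biopython's 1-based cut positions).
from typing import List
from Bio.Seq import Seq
from Bio.Restriction import RestrictionBatch

rsb = RestrictionBatch(["EcoRI", "BamHI"])
print(restriction_sites_present("AAAGAATTCAAAGGATCCAAA", rsb))  # [5, 14]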
def calc_digest_products(seq, enzymes, *, is_circular):
    from more_itertools import pairwise, flatten
    from Bio.Restriction import RestrictionBatch
    from Bio.Seq import Seq

    if not enzymes:
        raise UsageError("no enzymes specified", enzymes=enzymes)

    # High-fidelity variants share the parent enzyme's recognition site.
    enzymes = [re.sub('-HF(v2)?$', '', x) for x in enzymes]
    try:
        batch = RestrictionBatch(enzymes)
    except ValueError:
        raise ConfigError(
            lambda e: f"unknown enzyme(s): {','.join(map(repr, e.enzymes))}",
            enzymes=enzymes,
        ) from None

    sites = [x - 1 for x in flatten(batch.search(Seq(seq)).values())]
    if not sites:
        raise ConfigError(
            lambda e: f"{','.join(map(repr, e.enzymes))} {plural(enzymes):/does/do} not cut template.",
            enzymes=enzymes,
            seq=seq,
        )

    # For a linear template, the ends also delimit fragments.
    sites += [] if is_circular else [0, len(seq)]
    sites = sorted(sites)

    seqs = []
    for i, j in pairwise(sites):
        seqs.append(seq[i:j])

    if is_circular:
        # The fragment spanning the origin wraps around the zero index.
        wrap_around = seq[sites[-1]:] + seq[:sites[0]]
        seqs.append(wrap_around)

    return seqs
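# Usage sketch for calc_digest_products (hedged: UsageError, ConfigError, and
# plural come from the surrounding project and are only hit on error paths;
# `re` must be importable at module level for the -HF suffix strip).
import re

products = calc_digest_products("AAAGAATTCAAATTT", ["EcoRI"], is_circular=False)
print(products)  # ['AAAG', 'AATTCAAATTT'] -- EcoRI cuts G^AATTC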
def find_spacers(target=None, outfile=None, refgenome=None, restriction_sites=[],
                 largeIndex=False, cutoff=0, offtargetcutoff=0, trim=False,
                 logging=False, nuclease='Cas9', return_limit=9, reject=False):
    with open(target, 'r') as infile:
        # Use our modified FastaIterator instead of, perhaps more properly,
        # SeqIO.parse(); it allows us to trim the UTR sequences off.
        if trim:
            # If the 'trim' option is enabled, the header for each entry must be
            # GENEID | TRANSCRIPTID | EXON RANK | CONSTITUTIVE EXON | 5' UTR END | 3' UTR STOP | EXON START | EXON END
            try:
                itemiter = m_FastaIterator(infile)
            except IOError as e:
                print("I/O error({0}): {1}".format(e.errno, e.strerror))
        else:
            try:
                itemiter = SeqIO.parse(infile, 'fasta')
            except IOError as e:
                print("I/O error({0}): {1}".format(e.errno, e.strerror))
        itemlist = [temp for temp in itemiter]
    spacerlist = []
    # This will find our 20N-NGG target sequence plus the -4->-3 and +1->+3
    # nucleotides for scoring
    if nuclease == 'Cas9':
        PAM = r'(?i)[ACGT]{25}[G]{2}[ACGT]{3}'
    elif nuclease == 'Cpf1':
        PAM = r'(?i)[T]{2,}[A-Z]{25}'
    rsb = RestrictionBatch(restriction_sites)
    seen = []
    print("{} sequences to search for spacers.".format(len(itemlist)))
    widgets = ['Examining sequence: ', progressbar.Counter(), ' ',
               progressbar.Percentage(), ' ', progressbar.Bar(),
               progressbar.Timer()]
    progress = progressbar.ProgressBar(widgets=widgets, maxval=len(itemlist)).start()
    spacer_re = regex.compile(PAM)
    for i, item in enumerate(itemlist):
        # Find all of the potential protospacer sequences, i.e. any
        # 21-nucleotide sequence that precedes a double G
        progress.update(i + 1)
        spacerMatch = (spacer_re.findall(str(item.seq), overlapped=True)
                       + spacer_re.findall(str(item.reverse_complement().seq), overlapped=True))
        for ps in spacerMatch:
            # Note that ps[4:24] is the actual protospacer; the rest of the
            # sequence is needed for scoring
            ps_seq = Seq(ps[4:24], IUPAC.unambiguous_dna)
            rs = rsb.search(ps_seq)
            # The on-target score calculator only works if the sequence is in
            # uppercase; otherwise it returns a constant low value
            score = calc_score(ps.upper())
            if "TTTT" in ps[4:24]:
                # Discard anything with T(4+): those act as RNAP III terminators
                # TODO Should this also eliminate anything with G(4)?
                pass
            elif bool([y for y in rs.values() if y != []]):
                # Discard anything containing a forbidden restriction site
                pass
            elif GC(ps_seq) <= 20 or GC(ps_seq) >= 80:
                # Discard candidates with GC content <=20% or >=80%
                pass
            elif float(score) < cutoff:
                # Explicitly convert cutoff to float: otherwise the comparison
                # sometimes fails (especially with cutoff=0.5 on the command line)
                pass
            else:
                # Keep everything else, skipping duplicated sequences
                if ps[4:24] not in seen:
                    position = int(str(item.seq).find(ps)) + int(item.description.split("|")[7])
                    # 'name' was listed as a key in the original but had no
                    # matching value, so zip() silently dropped it
                    keys = ['description', 'position', 'score', 'spacer', 'offtargetscore']
                    values = [item.description, int(position), score, ps[4:24], 100]
                    spacerlist.append(dict(zip(keys, values)))
                    seen.append(ps[4:24])
    progress.finish()
    if len(spacerlist) == 0:
        print("Sorry, no spacers matching that criteria were found")
        return 0
    print("Finished finding spacers. {} spacers found. Beginning Bowtie alignment...".format(len(spacerlist)))
    # Write out a file to pass to Bowtie for off-target analysis
    with open('temp.fa', 'w') as tempfile:
        for entry in spacerlist:
            tempfile.writelines(">%s %s %s\n%s\n" % (entry['description'], entry['position'],
                                                     entry['score'], entry['spacer']))
    # Delete lists we will not use again to save some memory
    del itemlist
    del seen
    # Use Bowtie to find whether a spacer has potential off-targets (i.e. two
    # or fewer mismatches). As currently set, Bowtie returns everything a
    # spacer matches in the reference genome with two or fewer mismatches.
    # TODO switch to Bowtie2 so we can account for gaps in mismatches
    # TODO can we modify this setup to account for NAG PAMs, or are we already?
    program = 'bowtie'
    cpus = "-p" + str(cpu_count())
    args = [program, '-a', "--suppress", "3,5,6,7", cpus]
    if reject:
        # Tell Bowtie to stop looking after a set number of matches
        args += ['-k', str(reject)]
    if largeIndex:
        args += ['--large-index']
    args += [refgenome, '-f', 'temp.fa', 'offtargets.fa']
    subprocess.check_output(args)
    print("Bowtie finished. Beginning off-target analysis...")
    bowtie_results_file = 'offtargets.fa'
    oftcount = 1
    # This count is slow and probably unnecessary
    total_lines = sum(1 for line in open(bowtie_results_file))
    print("Total alignments from Bowtie: {}".format(total_lines))
    new_widgets = ['Scoring for off-targets. Examining: ', progressbar.Counter(), ' ',
                   progressbar.Percentage(), ' ', progressbar.Bar(),
                   progressbar.Timer(), progressbar.ETA()]
    new_progress = progressbar.ProgressBar(widgets=new_widgets, maxval=total_lines).start()
    prunedlist = []  # spacers whose off-target score is above the minimum cutoff
    mmpos_re = regex.compile('[0-9]{1,}')
    with open(bowtie_results_file) as offtargetsfile:
        # Parse the Bowtie results into something we can use.
        # Read in the first line and parse.
        keys = ['readname', 'strand', 'position', 'mmpositions']
        values = offtargetsfile.readline().strip('\n').split('\t')
        previous_entry = dict(zip(keys, values))
        current_set_of_offtargets = []
        # Read the remaining lines.
        for line in offtargetsfile:
            line_values = line.strip('\n').split('\t')
            next_entry = dict(zip(keys, line_values))
            new_progress.update(oftcount)
            oftcount += 1
            if next_entry['readname'] == previous_entry['readname']:
                # Same spacer: group its off-targets together
                current_set_of_offtargets.append(next_entry)
            else:
                # A new spacer: time to score the current set and move on.
                # Extract the mismatch positions from each entry.
                mmlist = []
                badcount = 0
                for entry in current_set_of_offtargets:
                    pos = mmpos_re.findall(entry['mmpositions'])
                    if len(pos) == 0:
                        # Bowtie returns a match for the spacer itself in the
                        # genome; two such matches indicate a perfect off-target
                        badcount += 1
                    elif entry['strand'] == '+':
                        mmlist.append([int(w) for w in pos])
                    elif entry['strand'] == '-':
                        mmlist.append([19 - int(w) for w in pos])
                # Find the spacer in spacerlist to which these off-targets
                # belong (assumes a match is always found) and set its score
                matching_spacer = next(
                    (spacer for spacer in spacerlist
                     if previous_entry['readname'] == '{} {} {}'.format(
                         spacer['description'], spacer['position'], str(spacer['score']))),
                    None)
                if badcount > 2:
                    matching_spacer['offtargetscore'] = 0
                elif reject and len(mmlist) > int(reject):
                    # Speed things up by rejecting spacers with too many off-targets
                    matching_spacer['offtargetscore'] = 0
                else:
                    matching_spacer['offtargetscore'] = sumofftargets(mmlist)
                if float(matching_spacer['offtargetscore']) >= float(offtargetcutoff):
                    prunedlist.append(matching_spacer)
                # Dump the previous set and start a new one with this line
                current_set_of_offtargets = []
                previous_entry = next_entry
                current_set_of_offtargets.append(next_entry)
    new_progress.finish()
    print("\nFinished scoring off-targets")
    finallist = []
    if len(prunedlist) == 0:
        print("No spacers were found that correspond to the parameters you set.")
    else:
        if len(prunedlist[0]['description'].split('|')) == 9:
            # Make sure the header format is correct
            for entry in prunedlist:
                finallist.append(FormattedResult(entry))
            # Collect all the GeneIDs in our list
            geneset = set(y.GeneID for y in finallist)
            toplist = []
            for z in geneset:
                all_spacers_for_gene = [a for a in finallist if a.GeneID == z]
                ranked_spacers = sorted(all_spacers_for_gene,
                                        key=attrgetter('score', 'offtargetscore'),
                                        reverse=True)
                if return_limit == 'all' or len(ranked_spacers) <= int(return_limit):
                    # The original dropped the last spacer here (off-by-one)
                    toplist.extend(ranked_spacers)
                else:
                    toplist.extend(ranked_spacers[:int(return_limit)])
            olist = [[entry.GeneID, entry.GeneName, entry.seq, entry.GC,
                      entry.position, entry.score, entry.offtargetscore]
                     for entry in toplist]
            headerlist = ['GeneID', 'GeneName', 'seq', '%GC', 'position',
                          'score', 'off-target score']
        else:
            # If it isn't, just dump all the spacers we found into a file
            finallist = spacerlist
            olist = [[entry.id.split(' ')[0], entry.seq, GC(entry.seq),
                      entry.position.split(' ')[1], entry.score.split(' ')[2]]
                     for entry in finallist]
            headerlist = ['ID', 'seq', '%GC', 'position', 'score']
        print("Writing file.")
        try:
            with open(outfile, 'w') as ofile:
                output = csv.writer(ofile, dialect='excel')
                output.writerows([headerlist])
                output.writerows(olist)
        except IOError:
            print("There was trouble writing to the file. Perhaps it is open in another application?")
            choice = input("Would you like to try again? [y/n]")
            if choice.lower() == 'y':
                try:
                    with open(outfile, 'w') as ofile:
                        output = csv.writer(ofile, dialect='excel')
                        output.writerows([headerlist])
                        output.writerows(olist)
                except IOError:
                    print("Sorry, still unable to write to the file")
            else:
                return 0
    print("Finished.")
class DigestedSequence:
    def __init__(self, enzymes, sequence, is_linear=True):
        self.enzymes = enzymes
        self.sequence = sequence
        self.res_batch = RestrictionBatch(enzymes)
        self.is_linear = is_linear
        self.site_dict = self.res_batch.search(self.sequence.seq, is_linear)

    def get_sites(self):
        """
        Return the set of sites for a given contig, ordered by increasing position.
        :return: list of CutSites
        """
        # CutSite is defined elsewhere; it must sort (and index) by position
        cut_sites = []
        for e_name, ctg_locs in self.site_dict.items():
            for loc in ctg_locs:
                cut_sites.append(CutSite(e_name, loc))
        return sorted(cut_sites)

    def get_fragments(self):
        """
        Return the genomic fragments resulting from the digestion.
        :return: list of SeqRecords.
        """
        sites = self.get_sites()
        seq = self.sequence
        if len(sites) == 0:
            # No cuts: return the whole sequence as the single fragment
            return [seq]
        frags = []
        for idx in range(1, len(sites)):
            a = sites[idx - 1]
            b = sites[idx]
            frg = seq[a:b]
            frg.id = "{0}:{1}:{2}".format(frg.id, a, b)
            frg.name = frg.id
            frg.description = "restriction digest fragment from {0} to {1}".format(a, b)
            frags.append(frg)
        return frags

    @staticmethod
    def digestion_sites(seq_list, enzyme_names=[], min_sites=1):
        """
        Return a list of sites per sequence, preserving the input list order.
        :param seq_list: list of sequences to analyze
        :param enzyme_names: enzymes used in digestion
        :param min_sites: minimum sites required for a sequence to be included.
        :return: list of sites per sequence
        """
        sites = []
        for seq in seq_list:
            if seq["excluded"]:
                continue
            ds = DigestedSequence(enzyme_names, seq["record"])
            seq_sites = ds.get_sites()
            if len(seq_sites) < min_sites:
                print("\tExcluded {0} (length {1}) with only {2} sites".format(
                    seq["record"].id, len(seq["record"]), len(seq_sites)))
                # continue
            sites.append({"name": seq["record"].id, "pos": seq_sites})
        return sites
#
# Created: 11/04/2013
# -------------------------------------------------------------------------------
from Bio import Entrez
from Bio import SeqIO
from Bio.Restriction import Restriction
from Bio.Restriction import RestrictionBatch

email = "*****@*****.**"
Entrez.email = email

fetch_seq = Entrez.efetch(db="nucleotide", rettype="fasta", retmode="text",
                          id="294489415")
seq_record = SeqIO.read(fetch_seq, "fasta")
fetch_seq.close()

# To see how a specific restriction enzyme (Sau3AI) would digest your sequence:
print("Restriction site is", Restriction.Sau3AI.site)
digest = Restriction.Sau3AI.catalyse(seq_record.seq)
print("Number of fragments is", len(digest))
print("------\nLengths of each fragment\n------")
for fragment in digest:
    print(len(fragment))

# Run every restriction enzyme in the New England Biolabs database against
# the sequence. Search once and reuse the result rather than re-searching
# inside the loop.
rb_supp = RestrictionBatch(first=[], suppliers=['N'])
results = rb_supp.search(seq_record.seq)
for rest, cut_sites in results.items():
    # To show only enzymes that create between 10 and 40 fragments, filter on
    # 10 < len(cut_sites) < 40 instead.
    # This shows every restriction enzyme that digested the sequence in some way.
    if len(cut_sites) > 0:
        print(rest, ":", cut_sites)
def _catalyze(record: SeqRecord,
              enzymes: List[RestrictionType],
              linear=True) -> List[Tuple[str, SeqRecord, str]]:
    """Catalyze a SeqRecord and return all post-digest SeqRecords with overhangs.

    Overhangs are returned as the overhang plus the position of the cut
    in the 5' end (^) and 3' end (_). So a 5' overhang may be: ^AAAA_.
    But a 3' overhang may be: _AAAA^.

    Args:
        record: The SeqRecord to digest with enzymes
        enzymes: List of enzymes to digest the input records with

    Keyword Args:
        linear: Whether the record to catalyze is linear or circular

    Returns:
        Tuple with: (left overhang, cut fragment, right overhang)
    """
    record = record.upper()
    batch = RestrictionBatch(enzymes)
    batch_sites = batch.search(record.seq, linear=linear)

    # order all cuts with enzymes based on index
    cuts_seen: Set[int] = set()
    enzyme_cuts: List[Tuple[RestrictionType, int]] = []
    for enzyme, cuts in batch_sites.items():
        for cut in cuts:
            if cut in cuts_seen:
                continue
            cuts_seen.add(cut)
            enzyme_cuts.append((enzyme, cut - 1))  # revert to 0-based
    enzyme_cuts = sorted(enzyme_cuts, key=lambda x: x[1])

    # list of left/right overhangs for each fragment
    frag_w_overhangs: List[Tuple[str, SeqRecord, str]] = []
    for i, (enzyme, cut) in enumerate(enzyme_cuts):
        if i == len(enzyme_cuts) - 1 and linear:
            continue

        next_enzyme, next_cut = enzyme_cuts[(i + 1) % len(enzyme_cuts)]

        enzyme_len = len(enzyme.ovhgseq)
        next_enzyme_len = len(next_enzyme.ovhgseq)

        # shift cuts left for 3' overhang enzymes
        if enzyme.is_3overhang():
            cut -= enzyme_len
        if next_enzyme.is_3overhang():
            next_cut -= next_enzyme_len

        cut_rc = cut if enzyme.is_3overhang() else cut + enzyme_len
        next_cut_rc = (next_cut if next_enzyme.is_3overhang()
                       else next_cut + next_enzyme_len)

        # find the cutsite sequences
        left = record[cut:cut + enzyme_len]
        right = record[next_cut:next_cut + next_enzyme_len]
        left_rc = right.reverse_complement()
        right_rc = left.reverse_complement()

        left = str(left.seq)
        right = str(right.seq)
        left_rc = str(left_rc.seq)
        right_rc = str(right_rc.seq)

        if enzyme.is_3overhang():
            left += "^"
            right_rc += "^"
        else:
            left = "^" + left
            right_rc = "^" + right_rc

        if next_enzyme.is_3overhang():
            right += "^"
            left_rc += "^"
        else:
            right = "^" + right
            left_rc = "^" + left_rc

        # shift cuts right again for 3' overhang enzymes
        if enzyme.is_3overhang():
            cut += enzyme_len
        if next_enzyme.is_3overhang():
            next_cut += next_enzyme_len

        frag = record[cut:next_cut]
        frag_rc = record[cut_rc:next_cut_rc].reverse_complement()
        frag_rc.id = record.id

        if next_cut < cut:  # wraps around the zero-index
            frag = (record + record)[cut:next_cut + len(record)]
            frag.id = record.id
            frag_rc = (record + record)[cut_rc:next_cut_rc + len(record)].reverse_complement()
            frag_rc.id = record.id

        frag_w_overhangs.append((left, frag, right))
        frag_w_overhangs.append((left_rc, frag_rc, right_rc))

    return frag_w_overhangs
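# Usage sketch for _catalyze (hedged: assumes the module-level imports its
# annotations imply -- List/Tuple/Set from typing, RestrictionBatch and
# RestrictionType from Bio.Restriction, SeqRecord from Bio.SeqRecord).
from Bio.Restriction import EcoRI
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

rec = SeqRecord(Seq("AAAGAATTCAAAGAATTCAAA"), id="demo")
for left, frag, right in _catalyze(rec, [EcoRI], linear=True):
    # EcoRI leaves 5' AATT overhangs, so each fragment is flanked by ^AATT
    print(left, str(frag.seq), right)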
def redigest_code():
    argscheck()
    ### output file
    if entry2_input.get() == "":
        outfile = 'redigest.' + TIME + '.out'
    else:
        outfile = entry2_input.get()
    out_file = open(outfile, 'wt+')
    ### processing gene sequences
    if entry7_input.get() == "Multifasta gene file":
        genomeSeq = "N"
    elif entry7_input.get() == "Single genome sequence":
        genomeSeq = "Y"
    if genomeSeq == 'N':
        ### making report file
        verbosity = 'Y'
        report_file = outfile + '.csv'
        RF = open(report_file, 'wt+')
        # reverse complement the reverse primer
        if entry6_input.get() == "":
            reverse = ""
        else:
            reverse = entry6_input.get()
        reverse = str(Seq(reverse).reverse_complement())
        # counter for the NAME
        count = 1
        ### iterating over sequences
        input_file = entry1_input.get()
        infile = open(input_file, 'r')
        # input file format
        if entry8_input.get() == "Fasta":
            informat = "fasta"
        elif entry8_input.get() == "Genbank":
            informat = "genbank"
        for record in SeqIO.parse(infile, informat):
            header = record.id
            array = str(record.seq)
            NAME = str('RED' + TIME + str(count))
            if informat == 'genbank':
                desc = str(', '.join(list(record.annotations["taxonomy"])))
            else:
                desc = ''
            ## adding forward primer sequence if provided
            if entry5_input.get() == "":
                forward = ""
            else:
                forward = entry5_input.get()
            if forward:
                Farray = forward + array
            else:
                Farray = array
            ### adding reverse primer sequence if provided
            if entry6_input.get() == "":
                reverse = ""
            else:
                reverse = entry6_input.get()
            if reverse:
                FarrayR = Farray.strip('\n') + reverse
            else:
                FarrayR = Farray
            ### orientation based on tagged end
            if entry4_input.get() == "Forward":
                tagg = "F"
            elif entry4_input.get() == "Reverse":
                tagg = "R"
            tagged = tagg.upper()
            if tagged == 'R':
                FarrayRseqFR = str(Seq(FarrayR).reverse_complement())
                SubFeat = "TRF_RevComp"
            else:
                FarrayRseqFR = str(FarrayR)
                SubFeat = "TRF"
            ### restriction enzyme check from list
            enzyme = entry3_input.get()
            enzyme_RE = RestrictionBatch([enzyme])
            ### search the restriction site positions in the sequence
            FarrayRseqFR_RE = enzyme_RE.search(Seq(FarrayRseqFR))
            ### convert the dict values to a list for indexing
            index = list(FarrayRseqFR_RE.values())[0]
            ### check whether a restriction site is present or the sequence is uncut
            if not index:
                fragment = len(FarrayRseqFR)
            else:
                fragment = index[0]
            ### add size to header, trim sequence to the terminal fragment length
            if not index:
                ### uncut fragment header and sequence
                FastaHeader = NAME + "|" + str(len(FarrayRseqFR)) + "_bp" + "|" + header
                FastaSeq = FarrayRseqFR
                Feat = SeqFeature(FeatureLocation(start=0, end=len(FarrayRseqFR)),
                                  type="REDigest", ref=SubFeat)
            else:
                ### cut fragment header; sequence sliced to the fragment length
                FastaHeader = NAME + "|" + str(fragment) + "_bp" + "|" + header
                FastaSeq = FarrayRseqFR[:fragment]
                Feat = SeqFeature(FeatureLocation(start=0, end=fragment),
                                  type="REDigest", ref=SubFeat)
            ### on-screen output: sequence header and all the fragments,
            ### based on verbosity
            if verbosity == 'Y':
                print(" ", FastaHeader, '\t', FarrayRseqFR_RE)
            ### counter for the locus name
            count += 1
            ### seq object
            FastaSequence = SeqRecord(Seq(FastaSeq, IUPAC.IUPACAmbiguousDNA()),
                                      FastaHeader, description=desc, name=NAME)
            ### append features to seq object
            FastaSequence.features.append(Feat)
            ### output format
            if entry9_input.get() == "Fasta":
                outformat = "fasta"
            elif entry9_input.get() == "Genbank":
                outformat = "genbank"
            if outformat == 'genbank':
                SeqIO.write(FastaSequence, out_file, outformat)
            else:
                SeqIO.write(FastaSequence, out_file, "fasta-2line")
            ### writing progress to the report file too
            print(FastaHeader, '\t', FarrayRseqFR_RE, file=RF)
    ############################################################### Genome
    else:
        ### parsing genome sequence
        ### output format
        if entry9_input.get() == "Fasta":
            outformat = "fasta"
        elif entry9_input.get() == "Genbank":
            outformat = "genbank"
        if entry2_input.get() == "":
            outfile = 'redigest.' + TIME + '.out'
        else:
            outfile = entry2_input.get()
        ### making report file
        report_file = outfile + '_RF.csv'
        RF = open(report_file, 'wt+')
        print("Individual restriction fragments", file=RF)
        print("[WRITING:] Individual restriction fragments to file:", report_file)
        input_file = entry1_input.get()
        infile = open(input_file, 'r')
        # input file format
        if entry8_input.get() == "Fasta":
            informat = "fasta"
        elif entry8_input.get() == "Genbank":
            informat = "genbank"
        for record in SeqIO.parse(infile, informat):
            Gen_header = record.id
            Gen_array = str(record.seq)
            if informat == 'genbank':
                desc = str(', '.join(list(record.annotations["taxonomy"])))
            else:
                desc = ''
            ### restriction enzyme check from list
            enzyme = entry3_input.get()
            enzyme_RE = RestrictionBatch([enzyme])
            ### search the restriction site positions in the sequence
            Gen_array_RE = enzyme_RE.search(Seq(Gen_array))
            Gen_array_RE_V = list(Gen_array_RE.values())[0]
            # assumes the enzyme cuts at least once; min()/max() fail on an empty list
            ID_min = min(Gen_array_RE_V)
            ID_max = max(Gen_array_RE_V)
            ID1 = 0
            ID2 = 1
            verbosity = "Y"
            # first fragment: from the first nt to the first cut
            GenFastaSeq = Gen_array[0:ID_min]
            GenFastaSeqLen = len(GenFastaSeq)
            GenFastaHeader = Gen_header + "|" + str(GenFastaSeqLen) + "_bp|" + Gen_header
            if verbosity == 'Y':
                print(" ", GenFastaHeader)
            GenSeqRec = SeqRecord(Seq(GenFastaSeq, IUPAC.IUPACAmbiguousDNA()),
                                  GenFastaHeader, description=desc)
            ### seq object to file
            SeqIO.write(GenSeqRec, out_file, outformat)
            # report to file
            print(GenFastaHeader, file=RF)
            # internal fragments: between consecutive cuts
            while ID2 < len(Gen_array_RE_V):
                GenFastaSeq = Gen_array[Gen_array_RE_V[ID1]:Gen_array_RE_V[ID2]]
                GenFastaSeqLen = len(GenFastaSeq)
                GenFastaHeader = Gen_header + "|" + str(GenFastaSeqLen) + "_bp|" + Gen_header
                if verbosity == 'Y':
                    print(" ", GenFastaHeader)
                # increment the indices
                ID1 += 1
                ID2 += 1
                GenSeqRec = SeqRecord(Seq(GenFastaSeq, IUPAC.IUPACAmbiguousDNA()),
                                      GenFastaHeader, description=desc)
                SeqIO.write(GenSeqRec, out_file, outformat)
                print(GenFastaHeader, file=RF)
            # last fragment: from the last cut to the last nt
            GenFastaSeq = Gen_array[ID_max:]
            GenFastaSeqLen = len(GenFastaSeq)
            GenFastaHeader = Gen_header + "|" + str(GenFastaSeqLen) + "_bp|" + Gen_header
            if verbosity == 'Y':
                print(" ", GenFastaHeader)
            GenSeqRec = SeqRecord(Seq(GenFastaSeq, IUPAC.IUPACAmbiguousDNA()),
                                  GenFastaHeader, description=desc)
            SeqIO.write(GenSeqRec, out_file, outformat)
            print(GenFastaHeader, file=RF)
            # terminal restriction fragments: from nucleotide 1 to each cut
            report_file2 = outfile + '_TRF.csv'
            TRF2 = open(report_file2, 'wt+')
            print("Terminal restriction fragments: from nucleotide 1 to respective cuts", file=TRF2)
            print("[WRITING:] Terminal restriction fragments, from nucleotide 1 to respective cuts to file:", report_file2)
            for GenomeFragment in Gen_array_RE_V:
                GenFastaHeader = Gen_header + "|" + str(GenomeFragment) + "_bp|" + Gen_header
                GenFastaSeq = Gen_array[:GenomeFragment]
                ### seq object
                GenFastaSequence = SeqRecord(Seq(GenFastaSeq, IUPAC.IUPACAmbiguousDNA()),
                                             GenFastaHeader, description=desc)
                ### on-screen output based on verbosity
                if verbosity == 'Y':
                    print(" ", GenFastaHeader)
                ### writing progress to file too
                print(GenFastaHeader, file=TRF2)
            ## close files
            TRF2.close()
    RF.close()
    # final close
    out_file.close()
    infile.close()