def rc_regions(gb, choice='whole'):
    """
    Reverse and complement given region of sequence.

    When ``choice`` is 'whole' the complete record sequence is passed through
    ``rc``. Otherwise only the chosen region is passed through ``rc`` and the
    output is the concatenation LSC + IRa + SSC + IRb, with the other regions
    left as extracted.

    Args:
        gb(Path or str): rotate_seq generated gb file
        choice(str): region to be processed, must be in 'LSC', 'IRa', 'SSC',
            'IRb', 'whole'.
    Return:
        new_file(Path): reverse-complemented fasta, written next to ``gb``
            with the suffix replaced by '.rc.rc'
    Raises:
        KeyError: if ``choice`` is not 'whole' and is not a key of the mapping
            returned by ``get_regions``.
    """
    # Function-scope import so that a plain str path can be accepted, as the
    # docstring promises; str has no with_suffix().
    from pathlib import Path

    raw = SeqIO.read(gb, 'gb')
    regions = get_regions(gb)
    if choice == 'whole':
        new_seq = rc(raw.seq)
    else:
        # Only the single-region case needs the per-region sequences.
        data = {name: regions[name].extract(raw).seq for name in regions}
        data[choice] = rc(regions[choice].extract(raw.seq))
        new_seq = data['LSC']
        for name in ['IRa', 'SSC', 'IRb']:
            new_seq += data[name]
    new_name = '_RC_' + raw.name
    new_file = Path(gb).with_suffix('.rc.rc')
    with open(new_file, 'w') as out:
        out.write(f'>{new_name}\n')
        out.write(f'{new_seq}\n')
    return new_file
def main():
    """Sample short sequences from FASTA files into a tab-separated table.

    Command-line options select the input file pattern (-i), the window size
    (-w), the sample length (-s), an optional fragment size for pair-end
    sampling (-f) and the output file (-o). Every sequence is cut into
    windows; one random start position is drawn per window (the last window
    is not used). Samples containing "N" are skipped. Each written line is
    "<sequence>\\t<running index>".

    In pair-end mode a fragment of ``--frag`` bases is taken and two lines
    are written: the first ``--sample`` bases, and ``rc`` of the last
    ``--sample`` bases.

    Exits with status 1 (after printing help or an error message) when a
    required option is missing or no input file matches.
    """
    usage = "usage: %prog [options]"
    description = "blah blah blah"

    optparser = OptionParser(version="%prog 0.1", description=description,
                             usage=usage, add_help_option=False)
    optparser.add_option("-h", "--help", action="help",
                         help="Show this help message and exit.")
    optparser.add_option("-i", "--ifile", dest="ifile", type="string",
                         help="input files, if you give a pattern for files, please use \" to surround the pattern string")
    optparser.add_option("-w", "--window", dest="window", type="int",
                         default=1000,
                         help="window to extract a sample sequence, default:1000")
    optparser.add_option("-s", "--sample", dest="sample", type="int",
                         default=36,
                         help="size of a sample sequence,default=36")
    optparser.add_option("-f", "--frag", dest="frag", type="int",
                         help="if pair-end calculation is needed, give the fragment size")
    optparser.add_option("-o", "--ofile", dest="ofile",
                         help="output file")
    (options, args) = optparser.parse_args()
    if not options.ifile or not options.window or not options.sample or not options.ofile:
        optparser.print_help()
        sys.exit(1)

    # The window size now comes from --window; it used to be parsed and then
    # ignored in favour of a hard-coded 1000. The default keeps old behaviour.
    window = options.window

    # "with" guarantees both handles are closed, including on sys.exit().
    with open(options.ofile, "w") as ohd:
        files = glob(options.ifile)
        if not files:
            sys.stderr.write("no file found: %s\n" % (options.ifile))
            sys.exit(1)

        m = 0  # running index written in the second column
        for f in files:
            sys.stdout.write("%s ... \n" % (f))
            sys.stdout.flush()
            with open(f, "r") as fahd:
                for record in SeqIO.parse(fahd, "fasta"):
                    # Floor division keeps n an int on Python 3 as well.
                    n = len(record.seq) // window
                    for l in range(0, n - 1):
                        s = l * window + randint(0, window - 1)
                        if options.frag:
                            # str() replaces the removed Seq.tostring().
                            fragment = str(record.seq[s:s + options.frag]).upper()
                            if fragment.find("N") == -1:
                                m += 1
                                ohd.write("%s\t%d\n" % (fragment[:options.sample], m))
                                m += 1
                                ohd.write("%s\t%d\n" % (rc(fragment[-1 * options.sample:]), m))
                        else:
                            fragment = str(record.seq[s:s + options.sample]).upper()
                            if fragment.find("N") == -1:
                                m += 1
                                ohd.write("%s\t%d\n" % (fragment, m))
def cseguid(seq):
    '''Returns the cSEGUID for the sequence.

    The cSEGUID is the url safe SEGUID checksum calculated for the
    lexicographically minimal string rotation of a DNA sequence.
    Only defined for circular sequences.

    The sequence and its reverse complement are both upper-cased and passed
    through SmallestRotation; the smaller of the two results is checksummed
    with seguid and formatted with pretty_string.
    '''
    from Bio.Seq import reverse_complement as rc

    watson = SmallestRotation(str(seq).upper())
    crick = SmallestRotation(str(rc(seq)).upper())
    return pretty_string(seguid(min(watson, crick)))
def _rc(record):
    """Return the reverse complement of ``record``, dispatching on its type.

    * ``str``       -> result of ``rc(record)``
    * ``Seq``       -> ``record.reverse_complement()``
    * ``SeqRecord`` -> a new SeqRecord holding the reverse-complemented
      sequence and the original id, name and description

    Raises:
        ValueError: if ``record`` is none of the supported types.
    """
    if isinstance(record, str):
        return rc(record)
    if isinstance(record, Seq):
        return record.reverse_complement()
    if isinstance(record, SeqRecord):
        flipped = record.seq.reverse_complement()
        return SeqRecord(flipped,
                         id=record.id,
                         name=record.name,
                         description=record.description)
    raise ValueError(
        'record must be one of str, Bio.Seq, or Bio.SeqRecord')
def primer_dict(self):
    """Return the primer table, building and caching it on first use.

    The table maps a fragment name to a two-item list: the forward primer as
    written, and the reverse primer passed through Biopython's
    ``reverse_complement``. The result is stored in ``self._primer_dict`` and
    returned as-is on later calls.
    """
    if self._primer_dict is not None:
        return self._primer_dict

    from Bio.Seq import reverse_complement as rc

    # (fragment, forward primer, reverse primer before reverse-complementing)
    primer_pairs = (
        ('F1', 'CTCAATAAAGCTTGCCTTGAGTGC', 'ACTGTATCATCTGCTCCTGTRTCT'),
        ('F2', 'AAATTGCAGGGCYCCTAG', 'CTRTTAGCTGCCCCATCTACATAG'),
        ('F3B', 'CACACTAATGATGTAARACARTTAACAG', 'GGGATGTGTACTTCTGAACTTAYTYTTGG'),
        ('F4', 'CGGGTTTATTWCAGRGACAGCAGA', 'GGGGTTAAYTTTACACATGGYTTTA'),
        ('F5a', 'GGCATYTCCTATGGCAGGAAGAAG', 'GTGGTGCARATGAGTTTTCCAGAGCA'),
        ('F6', 'GGGTTCTTRGGARCAGCAGGAAG', 'ATTGAGGCTTAAGCAGTGGGTTC'),
    )
    self._primer_dict = {
        fragment: [forward, rc(reverse)]
        for fragment, forward, reverse in primer_pairs
    }
    return self._primer_dict
def _rc(record):
    """Reverse-complement a string, a Seq or a SeqRecord.

    A ``str`` is handed to ``rc``; a ``Seq`` uses its own
    ``reverse_complement`` method; a ``SeqRecord`` yields a fresh SeqRecord
    whose sequence is reverse-complemented and whose id, name and description
    are copied from the input.

    Raises:
        ValueError: for any other input type.
    """
    if isinstance(record, str):
        result = rc(record)
    elif isinstance(record, Seq):
        result = record.reverse_complement()
    elif isinstance(record, SeqRecord):
        result = SeqRecord(
            record.seq.reverse_complement(),
            id=record.id,
            name=record.name,
            description=record.description,
        )
    else:
        raise ValueError(
            'record must be one of str, Bio.Seq, or Bio.SeqRecord'
        )
    return result
def reads_to_seqrecord(reads):
    '''Build a list of FASTQ-style SeqRecords out of BAM reads.

    Note: copied from Bio.SeqIO.QualityIO.py

    Each read must expose ``qname``, ``seq``, ``qual`` and ``is_reverse``.
    The record id, name and description are all set to ``qname``. Reads
    flagged ``is_reverse`` have their sequence reverse-complemented. The
    quality string is decoded with an offset of ord("!") and stored under
    the "phred_quality" per-letter annotation.

    Returns:
        list of SeqRecord, one per read, in input order.

    Raises:
        ValueError: if a decoded quality falls outside 0..93.
    '''
    # Imports hoisted out of the per-read loop (one of them was duplicated).
    from Bio.Alphabet import IUPAC
    from Bio.Seq import Seq
    from Bio.Seq import reverse_complement as rc
    from Bio.SeqRecord import SeqRecord

    # Precompute conversion table; range() works on Python 2 and 3 alike,
    # whereas xrange() is a NameError on Python 3.
    SANGER_SCORE_OFFSET = ord("!")
    q_mapping = {chr(code): code - SANGER_SCORE_OFFSET
                 for code in range(0, 255)}

    seqs = []
    for read in reads:
        # Get the sequence first
        bases = rc(read.seq) if read.is_reverse else read.seq
        record = SeqRecord(Seq(bases, IUPAC.ambiguous_dna),
                           id=read.qname,
                           name=read.qname,
                           description=read.qname)

        # Get the qualities second
        # NOTE(review): qualities are not reversed for reverse-strand reads
        # although the sequence is reverse-complemented - confirm this is
        # intended for the read objects passed in.
        qualities = [q_mapping[letter] for letter in read.qual]
        if qualities and (min(qualities) < 0 or max(qualities) > 93):
            raise ValueError("Invalid character in quality string")

        # Writes straight into the underlying dict, as in the Biopython code
        # this was copied from.
        dict.__setitem__(record._per_letter_annotations,
                         "phred_quality", qualities)

        seqs.append(record)

    return seqs
fragments_exons = {g: frs for (g, frs) in fragments_genes.iteritems() if g not in ('tat', 'rev')} fragments_exons['tat1'] = fragments_genes['tat'][0] fragments_exons['tat2'] = fragments_genes['tat'][1] fragments_exons['rev1'] = fragments_genes['rev'][0] fragments_exons['rev2'] = fragments_genes['rev'][1] fragments_RNA_structures = {'RRE': ['F5', 'F6'], "LTR5'": ['F1'], "LTR3'": ['F6']} fragments_other = {'env peptide': ['F4', 'F5'], 'psi': ['F1']} # Note: the reverse primers get reverse complemented (so everything is positive sense) primers_inner = {'F1': ['AAGTAGTGTGTGCCCGTCTGT', rc('TGCCAAAGAGTGATYTGAGGG')], 'F2': ['GGGCTGTTGGARATGTGG', rc('ACAAACTCCCAYTCAGGAATCCA')], 'F3a': ['GAAAGCATAGTRATATGGGGAAA', rc('CACCTGCCATCTGTTTTCCATA')], 'F3b': ['GAAAGCATAGTRATATGGGGAAA', rc('CACCTGCCATCTGTTTTCCATA')], 'F3B': ['GAAAGCATAGTRATATGGGGAAA', rc('CACCTGCCATCTGTTTTCCATA')], 'F4': ['TGGAAAGGTGAAGGGGCAG', rc('GTACACAGGCATGTGTRGCCCA')], 'F5a': ['TAAGAGAAAGAGCAGAAGACAGTGG', rc('CCAAATYCCYAGGAGCTGTTGATC')], 'F5b': ['TCTATTATGGRGTACCTGTRTGG', rc('CCAAATYCCYAGGAGCTGTTG')], 'F6': ['CAGGAAGCACTATGGGCGC', rc('CCAGAGAGCTCCCAGG')], } primers_outer = {'F1': ['CTCAATAAAGCTTGCCTTGAGTGC', rc('ACTGTATCATCTGCTCCTGTRTCT')], 'F2': ['AAATTGCAGGGCYCCTAG', rc('CTRTTAGCTGCCCCATCTACATAG')], 'F3a': ['CACACTAATGATGTAARACARTTAACAG', rc('TTCCATGTTYTAATCCTCATCCTGTCTAC')], # NOTE: F3b and F3B are actually the same, but I forgot about the last G for # the biggest part of the dataset. It's not a huge problem because that G is
# Edges of RNA structures RRE_edges = ['AGGAGCTATGTTCCTTGGGT', 'ACCTAAGGGATACACAGCTCCT'] LTR5 = [None, 'CTCTAGCA'] LTR3 = ['TGGANGGGNTANTTNNNTC', None] RNA_structure_edges = {'RRE': RRE_edges, "LTR5'": LTR5, "LTR3'": LTR3} # Edges of other regions env_peptide_edges = ['ATGAGAGTGAAGGAGAA', 'GCTCCTTGGGATGTTGATGATCTGTAGTGCT'] psi_element = ['CTCGGCTTGCT', 'AGCGGAGGCTAG'] # V1, V3, V4, and V5 actually start INSIDE these primers V1_edges = [ 'AANCCATGTGTAAAANTAACNCCACTNTGTGTNANTTTANAN', 'TGCTCTTTCAATNTCANCNCANNNNTAANA' ] V3_edges = ['ACAATGYACACATGGAATTARGCCA', rc('AGAAAAATTCYCCTCYACAATTAAA')] V4_edges = [ 'TTGTAANGCACANTTTTAATTGTGGAGGGGAATTTTTCTAC', 'AGAATAANACAAATTNTAAACANGTGGCAGNAAGTAGGA' ] V5_edges = [ 'ATCAAATATTACAGGGNTNNTAACAAGAGATGGNGGN', 'GNAGGAGGANATATGANGGANAATTGGAGAAGT' ] # V2 is particular: from the left it starts right there, from the right it ends # INSIDE this primer V2_edges = [ 'TGCTCTTTCAATNTCANCNCANNNNTAANA', 'AANACCTCANTCATTACACANGCNTGTCCAAANNTATCCTTTGANCCAATTCC' ] gp120_noVloops_edges = [
fragments_exons['tat2'] = fragments_genes['tat'][1] fragments_exons['rev1'] = fragments_genes['rev'][0] fragments_exons['rev2'] = fragments_genes['rev'][1] fragments_RNA_structures = { 'RRE': ['F5', 'F6'], "LTR5'": ['F1'], "LTR3'": ['F6'] } fragments_other = {'env peptide': ['F4', 'F5'], 'psi': ['F1']} # Note: the reverse primers get reverse complemented (so everything is positive sense) primers_inner = { 'F1': ['AAGTAGTGTGTGCCCGTCTGT', rc('TGCCAAAGAGTGATYTGAGGG')], 'F2': ['GGGCTGTTGGARATGTGG', rc('ACAAACTCCCAYTCAGGAATCCA')], 'F3a': ['GAAAGCATAGTRATATGGGGAAA', rc('CACCTGCCATCTGTTTTCCATA')], 'F3b': ['GAAAGCATAGTRATATGGGGAAA', rc('CACCTGCCATCTGTTTTCCATA')], 'F3B': ['GAAAGCATAGTRATATGGGGAAA', rc('CACCTGCCATCTGTTTTCCATA')], 'F4': ['TGGAAAGGTGAAGGGGCAG', rc('GTACACAGGCATGTGTRGCCCA')], 'F5a': ['TAAGAGAAAGAGCAGAAGACAGTGG', rc('CCAAATYCCYAGGAGCTGTTGATC')], 'F5b': ['TCTATTATGGRGTACCTGTRTGG', rc('CCAAATYCCYAGGAGCTGTTG')], 'F6': ['CAGGAAGCACTATGGGCGC',
RRE_edges = ['AGGAGCTATGTTCCTTGGGT', 'ACCTAAGGGATACACAGCTCCT'] LTR5 = [None, 'CTCTAGCA'] LTR3 = ['TGGANGGGNTANTTNNNTC', None] RNA_structure_edges = {'RRE': RRE_edges, "LTR5'": LTR5, "LTR3'": LTR3} # Edges of other regions env_peptide_edges = ['ATGAGAGTGAAGGAGAA', 'GCTCCTTGGGATGTTGATGATCTGTAGTGCT'] psi_element = ['CTCGGCTTGCT', 'AGCGGAGGCTAG'] # V1, V3, V4, and V5 actually start INSIDE these primers V1_edges = ['AANCCATGTGTAAAANTAACNCCACTNTGTGTNANTTTANAN', 'TGCTCTTTCAATNTCANCNCANNNNTAANA'] V3_edges = ['ACAATGYACACATGGAATTARGCCA', rc('AGAAAAATTCYCCTCYACAATTAAA')] V4_edges = ['TTGTAANGCACANTTTTAATTGTGGAGGGGAATTTTTCTAC', 'AGAATAANACAAATTNTAAACANGTGGCAGNAAGTAGGA'] V5_edges = ['ATCAAATATTACAGGGNTNNTAACAAGAGATGGNGGN', 'GNAGGAGGANATATGANGGANAATTGGAGAAGT'] # V2 is particular: from the left it starts right there, from the right it ends # INSIDE this primer V2_edges = ['TGCTCTTTCAATNTCANCNCANNNNTAANA', 'AANACCTCANTCATTACACANGCNTGTCCAAANNTATCCTTTGANCCAATTCC'] gp120_noVloops_edges = ['NNAGAANANTTGTGGGTCACAGTCTATTATGGGGTACCT', 'AAAGCCTAAAGCCATGTGTAAAATTAACCCCACTCTGTGTTAGTTTAAAG', 'AANACCTCANTCATTACACANGCNTGTCCAAANNTATCCTTTGANCCAATTCC', 'ACAGTACAATGTACACATGGAATTANNCCA', 'TTTAATTGTGGAGGGGAATTTTTCT', 'GGGGAATTTTTCTAC', 'GAATAAAACAAATTNTAAACANGTGGCAGAAAGTAGGAAAAGCA', 'ATTACAGGGNTNNTATTAACAAGAGATGGTGGT',
def cseguid(seq):
    '''Returns the cSEGUID for the sequence.

    The cSEGUID is the url safe SEGUID checksum calculated for the
    lexicographically minimal string rotation of a DNA sequence.
    Only defined for circular sequences.

    Steps: upper-case the sequence and its reverse complement, run each
    through SmallestRotation, keep the smaller result, checksum it with
    seguid and format the checksum with pretty_string.
    '''
    from Bio.Seq import reverse_complement as rc

    forward_min = SmallestRotation(str(seq).upper())
    reverse_min = SmallestRotation(str(rc(seq)).upper())
    canonical = min(forward_min, reverse_min)
    checksum = seguid(canonical)
    return pretty_string(checksum)
def copy_features(source_sr, target_sr, limit = 10):
    '''Copy features from source_sr onto every place their sequence occurs in
    target_sr.

    For each feature of source_sr longer than ``limit``, the feature's
    upper-cased sequence is searched for in the upper-cased target sequence
    and in its reverse complement. Each hit becomes a new SeqFeature on
    target_sr (strand 1 for forward hits, -1 for reverse-complement hits)
    carrying the type, id and qualifiers of the source feature.

    Parameters
    ----------
    source_sr : SeqRecord or Dseqrecord
        The sequence to copy features from
    target_sr : SeqRecord or Dseqrecord
        The sequence to copy features to; modified in place
    limit : int
        Features with ``len(feature) <= limit`` are ignored

    Returns
    -------
    bool : True
        Always True; the new features are appended to ``target_sr.features``.
    '''
    import re
    from Bio.Seq import reverse_complement as rc

    target_length = len(target_sr)
    target_string = str(target_sr.seq).upper()

    # Targets without a ``circular`` attribute are treated as linear.
    try:
        circular = bool(target_sr.circular)
    except AttributeError:
        circular = False

    newfeatures = []

    trgt_string = target_string
    trgt_string_rc = rc(trgt_string)
    for feature in [f for f in source_sr.features if len(f) > limit]:
        fsr = feature.extract(source_sr).upper()
        # The wrap-around extension is disabled (would be len(fsr)); with 0
        # the searched string equals the plain target, so no hit can span
        # the origin and the "join" branch below is not reached.
        featurelength = 0

        if circular:
            trgt_string = target_string + target_string[:featurelength]
            trgt_string_rc = rc(trgt_string)

        # The feature sequence is used as the search pattern, unescaped.
        pattern = re.compile(str(fsr.seq))
        rc_length = len(trgt_string_rc)
        positions = (
            [(m.start(), m.end(), 1,)
             for m in pattern.finditer(trgt_string)]
            +
            # Map hits on the reverse complement back to forward coordinates.
            [(rc_length - m.end(), rc_length - m.start(), -1,)
             for m in pattern.finditer(trgt_string_rc)])

        for begin, end, strand in positions:
            if circular and begin < target_length < end:
                # Hit spans the origin: split it into two sub-features.
                end = end - target_length
                # Fixed: this used the undefined name ``trgt_length``.
                sf1 = SeqFeature(FeatureLocation(begin, target_length),
                                 type=feature.type,
                                 location_operator=feature.location_operator,
                                 strand=strand,
                                 id=feature.id,
                                 qualifiers=feature.qualifiers,
                                 sub_features=None,)
                sf2 = SeqFeature(FeatureLocation(0, end),
                                 type=feature.type,
                                 location_operator=feature.location_operator,
                                 strand=strand,
                                 id=feature.id,
                                 qualifiers=feature.qualifiers,
                                 sub_features=None,)
                nf = SeqFeature(FeatureLocation(begin, end),
                                type=feature.type,
                                location_operator="join",
                                strand=strand,
                                id=feature.id,
                                qualifiers=feature.qualifiers,
                                sub_features=[sf1, sf2],)
            else:
                nf = SeqFeature(FeatureLocation(begin, end),
                                type=feature.type,
                                location_operator=feature.location_operator,
                                strand=strand,
                                id=feature.id,
                                qualifiers=feature.qualifiers,
                                sub_features=None)
            newfeatures.append(nf)
    target_sr.features.extend(newfeatures)
    return True
def filter_tendem(gen):
    """Yield the codes from ``gen`` whose ``max_rep`` value is not above 2."""
    for candidate in gen:
        if not max_rep(candidate) > 2:
            yield candidate


if __name__ == "__main__":
    # Pipeline: 8-long barcodes -> drop codes rejected by filter_tendem ->
    # KeepDist(3) filter -> first 96 survivors.
    candidates = filter_tendem(barcode_gen(8))
    spaced = KeepDist(3).filter(candidates)
    for index, barcode in enumerate(islice(spaced, 96)):
        core = "GTCGGA" + barcode + "G"
        # Same sequence of operations as before: reverse rc(core), append
        # "GATC", then reverse the whole string again.
        reverse_linker = (rc(core)[::-1] + "GATC")[::-1]
        forward_linker = "TA" + core
        # Two-character, zero-padded index.
        label = str(index).zfill(2)
        print(f"MseI-linker-{label}-F\t{forward_linker}")
        print(f"MseI-linker-{label}-R\t{reverse_linker}")
# Collect the variants that fall inside the gene coordinates, keyed by
# position, while patching the three sequence copies in place.
variants_in_gene = {}
pb = ProgressBar(maxval=UnknownLength)
# NOTE(review): the file opened here is never explicitly closed.
for line in pb(open(variants)):
    # Four whitespace-separated fields per line; the third is ignored.
    chrom, pos, _, variant = line.split()
    pos = int(pos)
    for gene_chrom, low, high in gene_coords:
        # Inclusive range check against each coordinate triple.
        if chrom == gene_chrom and low <= pos <= high:
            # The variant field holds two alleles separated by '|'.
            variant = variant.split('|')
            mel, sim = variant
            #assert chroms[chrom][pos] == 'N'
            # Ambiguity lookup is keyed by the sorted allele pair.
            chroms[chrom][pos] = ambigs[tuple(sorted(variant))]
            mel_copy[chrom][pos] = mel
            sim_copy[chrom][pos] = sim
            # The two alleles written above must differ.
            assert mel_copy[chrom][pos] != sim_copy[chrom][pos]
            if gene_strand == '-':
                # On the '-' strand both alleles are passed through rc
                # before being recorded.
                variant = (rc(variant[0]), rc(variant[1]))
            variants_in_gene[pos] = variant
            # Stop at the first coordinate range containing this position.
            break

# Build the hunchback transcript
# One string per coordinate range for each of the three copies, plus the flat
# list of positions covered (start inclusive, stop exclusive).
gene_str = []
mel_str = []
sim_str = []
gene_bps = []
for chrom, start, stop in gene_coords:
    gene_str.append(str(chroms[chrom][start:stop]))
    mel_str.append(str(mel_copy[chrom][start:stop]))
    sim_str.append(str(sim_copy[chrom][start:stop]))
    gene_bps.extend(range(start, stop))