def directStringSeq():
    """Print a fixed DNA string and the sequences derived from it.

    The string is passed through ``complement``, ``reverse_complement``,
    ``transcribe``, ``back_transcribe`` and ``translate``; each result is
    printed on its own line, prefixed with a label.
    """
    my_string = "GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG"

    # The whole table is built before the first print, so nothing is
    # written if any of the helper calls raises.
    report = [
        ('my_string = ', my_string),
        ('Compl = ', complement(my_string)),
        ('reCompl = ', reverse_complement(my_string)),
        ('transc = ', transcribe(my_string)),
        ('bTransc = ', back_transcribe(my_string)),
        ('transl = ', translate(my_string)),
    ]

    for label, value in report:
        print(label, value)
def get_gc_coverage(fasta, binwidth, output):
    """Write a per-base sliding-window GC track for a FASTA file.

    Every record of ``fasta`` is read into memory, and for each position of
    each record ``GC`` is evaluated on a window of ``binwidth`` bases around
    that position. Record ends are padded with the mirrored (reversed)
    sequence so that edge windows are full width whenever the record is long
    enough to supply the padding.

    :param fasta: path or handle accepted by ``SeqIO.parse(..., "fasta")``
    :param binwidth: window width in bases; must be at least 1
    :param output: path of the file opened for writing through ``pybw.open``
        (presumably a bigWig file via pyBigWig -- confirm against the imports)
    :raises ValueError: if ``binwidth`` is smaller than 1
    """
    if binwidth < 1:
        raise ValueError("binwidth must be at least 1, not %r" % (binwidth,))

    # A window starts left_pad bases before the position and ends right_pad
    # bases after it. For even widths the two differ by one, so the right
    # side needs its own pad length to keep the last windows full width.
    left_pad = (binwidth - 1) // 2
    right_pad = binwidth - 1 - left_pad

    fasta_dict = {record.id: record.seq for record in SeqIO.parse(fasta, "fasta")}
    header = [(chrom, len(seq)) for chrom, seq in fasta_dict.items()]

    with pybw.open(output, 'w') as outbw:
        outbw.addHeader(header)
        for chrom, sequence in fasta_dict.items():
            n_bases = len(sequence)

            # Mirror the ends by plain reversal. The right slice uses an
            # explicit start index because sequence[-0:] would be the whole
            # sequence rather than an empty pad.
            head = sequence[:left_pad][::-1]
            tail = sequence[max(n_bases - right_pad, 0):][::-1]
            padded = head + sequence + tail

            # One GC call per base: O(len * binwidth). A cumulative-sum
            # formulation would be faster if this becomes a bottleneck.
            gc_vector = np.fromiter(
                (GC(padded[i:i + binwidth]) for i in range(n_bases)),
                dtype=np.float64,
                count=n_bases,
            )
            outbw.addEntries(chrom, 0, values=gc_vector, span=1, step=1)
def drawnucleotides(self):
    """Draw one text label per nucleotide of the visible window.

    The font size is always recalculated first. Nothing is drawn unless
    ``self.display_sequence`` is true. The window sequence is taken from
    ``genome_window.genome.seq`` and replaced by its reverse complement when
    ``genome_window.top_positive`` is false. Each letter is placed at the
    matching entry of ``genome_window.x_array`` with y = 0.
    """
    self.calculatefontsize()
    if not self.display_sequence:
        return

    window = self.genome_window
    # The slice includes the base at window_right, hence the + 1.
    sequence = window.genome.seq[window.window_left:window.window_right + 1]
    if not window.top_positive:
        sequence = sequence.reverse_complement()

    label_size = self.fontsize - self.font_spacer
    for x, nt in zip(window.x_array, sequence):
        self.ax.text(
            x,
            0,
            nt,
            ha='center',
            va='bottom',
            fontsize=label_size,
            color=self.color_dictionary[nt],
            **self.font_kwargs,
        )
def add_nucl(S_1, S_2, p_number_seq, count_nucl):
    """
    Adds random nucleotides to two sequences until they can be linked.

    Linking becomes possible when the complement of the last ``count_nucl``
    characters of S_1 occurs somewhere in S_2, or the complement of the first
    ``count_nucl`` characters of S_2 occurs somewhere in S_1. Until then one
    nucleotide per step is drawn with the module-level weights ``A_weig``,
    ``C_weig``, ``T_weig`` and ``G_weig``, and is either appended to S_1 or
    prepended to S_2.

    :param S_1: the first sequence
    :param S_2: the second sequence
    :param p_number_seq: the probability that the next nucleotide is appended
        to S_1; otherwise it is prepended to S_2
    :param count_nucl: the number of nucleotides that is enough for linkage
    :return: linked sequence
    """
    # Fragment -> start offsets inside S_1. The offsets stay valid while
    # S_1 grows, because new nucleotides are only appended on the right.
    version_for_1 = {}
    for i in range(len(S_1) - count_nucl + 1):
        version_for_1.setdefault(S_1[i:(i + count_nucl)], []).append(i)

    # Fragment -> number of characters that follow it in S_2. This stays
    # valid while S_2 grows, because new nucleotides are only prepended.
    # The index is built over S_2's own length, so inputs of different
    # lengths get complete and correctly measured entries.
    version_for_2 = {}
    for i in range(len(S_2) - count_nucl + 1):
        version_for_2.setdefault(S_2[i:(i + count_nucl)], []).append(
            len(S_2) - i - count_nucl)

    while (complement(S_1[-count_nucl:]) not in version_for_2
           and complement(S_2[:count_nucl]) not in version_for_1):
        if random() < p_number_seq:
            S_1 = S_1 + choices(['A', 'C', 'T', 'G'],
                                weights=[A_weig, C_weig, T_weig, G_weig])[0]
            version_for_1.setdefault(S_1[-count_nucl:], []).append(
                len(S_1) - count_nucl)
        else:
            S_2 = choices(['A', 'C', 'T', 'G'],
                          weights=[A_weig, C_weig, T_weig, G_weig])[0] + S_2
            version_for_2.setdefault(S_2[:count_nucl], []).append(
                len(S_2) - count_nucl)

    tail_key = complement(S_1[-count_nucl:])
    if tail_key in version_for_2:
        # The end of S_1 pairs with a fragment of S_2: keep S_1 and add the
        # complement of the trailing part of S_2.
        index = choice(version_for_2[tail_key])
        # NOTE(review): this slice takes index + 1 trailing characters, one
        # more than the count stored in version_for_2 -- confirm whether the
        # one-base overlap is intended.
        S = S_1 + complement(S_2[(-index - 1):])
    else:
        # The start of S_2 pairs with a fragment of S_1: keep S_1 up to that
        # fragment and continue with the complement of all of S_2.
        index = choice(version_for_1[complement(S_2[:count_nucl])])
        S = S_1[:index] + complement(S_2)
    return S
def molecular_weight(
    seq, seq_type="DNA", double_stranded=False, circular=False, monoisotopic=False
):
    """Return the molecular mass of a DNA, RNA or protein sequence as a float.

    Whitespace is stripped and the sequence is upper-cased before lookup.
    Only unambiguous letters are accepted. Nucleotide sequences are assumed
    to carry a 5' phosphate.

    Arguments:
     - seq: String or Biopython sequence object.
     - seq_type: "DNA" (the default), "RNA" or "protein".
     - double_stranded: Add the mass of the complementary strand?
     - circular: Treat the molecule as circular (no free ends)?
     - monoisotopic: Use the monoisotopic mass tables?

    Raises ValueError for an unknown seq_type, for a letter missing from the
    selected weight table, and for a double-stranded protein.

    >>> print("%0.2f" % molecular_weight("AGC"))
    949.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC")))
    949.61

    However, it is better to be explicit - for example with strings:

    >>> print("%0.2f" % molecular_weight("AGC", "DNA"))
    949.61
    >>> print("%0.2f" % molecular_weight("AGC", "RNA"))
    997.61
    >>> print("%0.2f" % molecular_weight("AGC", "protein"))
    249.29

    """
    # Minimal normalisation: drop all whitespace, then upper-case.
    seq = "".join(str(seq).split()).upper()

    # Pick the average-mass table by name; the monoisotopic tables share the
    # same names behind a common prefix.
    if seq_type == "DNA":
        table_name = "unambiguous_dna_weights"
    elif seq_type == "RNA":
        table_name = "unambiguous_rna_weights"
    elif seq_type == "protein":
        table_name = "protein_weights"
    else:
        raise ValueError("Allowed seq_types are DNA, RNA or protein, not %r" % seq_type)
    if monoisotopic:
        table_name = "monoisotopic_" + table_name
    weight_table = getattr(IUPACData, table_name)

    water = 18.010565 if monoisotopic else 18.0153

    def linear_mass(letters):
        # Sum of the residue masses minus one water per bond of a linear chain.
        return sum(weight_table[x] for x in letters) - (len(letters) - 1) * water

    try:
        weight = linear_mass(seq)
    except KeyError as e:
        raise ValueError(
            "%s is not a valid unambiguous letter for %s" % (e, seq_type)
        ) from None
    if circular:
        # Closing the ring removes one more water.
        weight -= water

    if not double_stranded:
        return weight

    if seq_type == "protein":
        raise ValueError("protein sequences cannot be double-stranded")
    elif seq_type == "DNA":
        seq = complement(seq, inplace=False)  # TODO: remove inplace=False
    elif seq_type == "RNA":
        seq = complement_rna(seq)

    # Second strand: same bookkeeping applied to the complementary sequence.
    weight += linear_mass(seq)
    if circular:
        weight -= water
    return weight