def setUp(self):
    """Build one codon alignment per fixture and store them in ``self.alns``.

    Every fixture is indexed as ``entry[0]`` (file paths: nucleotide FASTA,
    protein Clustal alignment and, for ``"id"`` fixtures, an id-correspondence
    file) and ``entry[1]`` (loading mode: ``"parse"``, ``"index"`` or ``"id"``).
    """
    self.aln_file = [
        TEST_ALIGN_FILE1,
        TEST_ALIGN_FILE2,
        TEST_ALIGN_FILE3,
        TEST_ALIGN_FILE4,
        TEST_ALIGN_FILE5,
        TEST_ALIGN_FILE6,
    ]
    codon_alignments = []
    for entry in self.aln_file:
        paths = entry[0]
        mode = entry[1]
        if mode == "parse":
            # Nucleotide records are streamed from the FASTA file.
            nucl = SeqIO.parse(paths[0], "fasta",
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(paths[1], "clustal", alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                caln = codonalign.build(
                    prot, nucl, alphabet=codonalign.default_codon_alphabet)
        elif mode == "index":
            # Nucleotide records are looked up through an on-disk index; the
            # protein alignment is read with the generic protein alphabet.
            nucl = SeqIO.index(paths[0], "fasta",
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(paths[1], "clustal", alphabet=generic_protein)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                caln = codonalign.build(
                    prot, nucl, alphabet=codonalign.default_codon_alphabet,
                    max_score=20)
            nucl.close()  # Close the indexed FASTA file
        elif mode == "id":
            nucl = SeqIO.parse(paths[0], "fasta",
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(paths[1], "clustal", alphabet=IUPAC.protein)
            # First two whitespace-separated columns of each line map one id
            # onto the other.
            id_map = {}
            with open(paths[2]) as handle:
                for line in handle:
                    fields = line.split()
                    id_map[fields[0]] = fields[1]
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                caln = codonalign.build(
                    prot, nucl, corr_dict=id_map,
                    alphabet=codonalign.default_codon_alphabet)
        codon_alignments.append(caln)
    self.alns = codon_alignments
def main(): (opts, args) = getoptions() # Load PWMs pssms = load_motifs(opts.pwm_dir, opts.pseudocount) if opts.testseq is not None: if opts.seqtype == 'RNA': seq = Seq(opts.testseq, IUPAC.IUPACUnambiguousRNA()).back_transcribe() seq.alphabet = IUPAC.IUPACUnambiguousDNA() else: seq = Seq(opts.testseq, IUPAC.IUPACUnambiguousDNA()) final = scan_all(pssms, seq, opts) print final.to_csv(sep="\t", index=False) else: # Scan in sequence print >> sys.stderr, "Scanning sequences ", tic = time.time() for seqrecord in SeqIO.parse(open(args[0]), "fasta"): seq = seqrecord.seq if opts.seqtype == "RNA": seq = seq.back_transcribe() seq.alphabet = IUPAC.IUPACUnambiguousDNA() final = scan_all(pssms, seq, opts) print final.to_csv(sep="\t", index=False) toc = time.time() print >> sys.stderr, "done in %0.2f seconds!" % (float(toc - tic))
def get_seq_record(record, start, stop, description):
    """Return a SeqRecord for the query bases aligned between start and stop.

    Given a SAM record, locate the query positions aligned to the reference
    positions ``start`` and ``stop`` and build a SeqRecord whose sequence is
    in the reference orientation.

    Args:
        record: aligned read exposing ``get_aligned_pairs``,
            ``get_forward_sequence``, ``query_name``, ``query_length`` and
            ``is_reverse``.
        start: reference position of the first base to extract.
        stop: reference position that ends the slice (used as the exclusive
            slice bound on the query).
        description: text placed in front of the orientation tag in the
            record description.

    Returns:
        A SeqRecord whose description is ``"<description>|f"`` for forward
        reads or ``"<description>|rc"`` for reverse reads, or ``None`` when
        either endpoint is not aligned to a query base.
    """
    # get the query positions of the bases mapped to start, stop
    # currently only extracts from reads where both endpoints are mapped
    positions = record.get_aligned_pairs(matches_only=True)
    first_position = [item[0] for item in positions if item[1] == start]
    last_position = [item[0] for item in positions if item[1] == stop]

    if not (first_position and last_position):
        return None

    # fetch and reorient sequence
    read_bases = record.get_forward_sequence()
    if not record.is_reverse:
        seq = Seq(read_bases[first_position[0]:last_position[0]],
                  IUPAC.IUPACUnambiguousDNA())
        direction = 'f'
    else:
        # Aligned positions refer to the reference-oriented query, so mirror
        # them onto the read as sequenced before reverse-complementing.
        length = record.query_length
        seq = Seq(
            read_bases[length - last_position[0]:length - first_position[0]],
            IUPAC.IUPACUnambiguousDNA()).reverse_complement()
        direction = 'rc'

    # Use the tag itself; indexing it with the boolean turned 'rc' into 'c'.
    return SeqRecord(seq, id=record.query_name,
                     description='|'.join([description, direction]))
def chgAlpha(self, newAlpha):
    """Rebuild ``self.seq`` with a new alphabet and re-validate it.

    Accepts 'DNA', 'RNA' or 'protein', or a Bio.Alphabet alphabet object
    which is then used as-is.  The chosen alphabet is also stored on
    ``self.typ`` before ``self.checkAlpha()`` is called.

    Raises:
        NameError: if ``newAlpha`` is neither one of the three names nor an
            alphabet object.
    """
    from Bio.Seq import Seq
    from Bio.Alphabet import Alphabet, IUPAC

    if newAlpha == "DNA":
        alpha = IUPAC.IUPACUnambiguousDNA()
    elif newAlpha == "RNA":
        # Previously this branch selected the DNA alphabet by mistake.
        alpha = IUPAC.IUPACUnambiguousRNA()
    elif newAlpha == "protein":
        alpha = IUPAC.IUPACProtein()
    elif isinstance(newAlpha, Alphabet):
        # Alphabet objects are documented as accepted but used to fall
        # through to the error below.
        alpha = newAlpha
    else:
        raise NameError("type not 'DNA', 'RNA', or 'protein'")

    self.typ = alpha
    self.seq = Seq(self.seq.tostring(), alpha)
    self.checkAlpha()
def setUp(self):
    """Build one codon alignment per fixture and store them in ``self.alns``.

    Every fixture is indexed as ``entry[0]`` (file paths) and ``entry[1]``
    (loading mode: ``'parse'``, ``'index'`` or ``'id'``).
    """
    self.aln_file = [
        TEST_ALIGN_FILE1,
        TEST_ALIGN_FILE2,
        TEST_ALIGN_FILE3,
        TEST_ALIGN_FILE4,
        TEST_ALIGN_FILE5,
        TEST_ALIGN_FILE6,
    ]
    codon_alignments = []
    for entry in self.aln_file:
        paths = entry[0]
        mode = entry[1]
        if mode == 'parse':
            nucl = SeqIO.parse(paths[0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl, alphabet=codonalign.default_codon_alphabet)
        elif mode == 'index':
            # Deliberately using a fancy protein alphabet for testing:
            nucl = SeqIO.index(paths[0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(
                paths[1], 'clustal',
                alphabet=Gapped(IUPAC.ExtendedIUPACProtein()))
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl, alphabet=codonalign.default_codon_alphabet,
                    max_score=20)
        elif mode == 'id':
            nucl = SeqIO.parse(paths[0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
            # First two whitespace-separated columns of each line map one id
            # onto the other.
            id_map = {}
            with open(paths[2]) as handle:
                for line in handle:
                    fields = line.split()
                    id_map[fields[0]] = fields[1]
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(
                    prot, nucl, corr_dict=id_map,
                    alphabet=codonalign.default_codon_alphabet)
        codon_alignments.append(caln)
        # Close the indexed FASTA file (invoked for every fixture's
        # nucleotide source, whatever the loading mode).
        nucl.close()
    self.alns = codon_alignments
def setUp(self):
    """Create two protein alignments and their matching nucleotide records.

    Results are stored as ``self.aln1``/``self.seqlist1`` and
    ``self.aln2``/``self.seqlist2``.
    """
    def dna_record(letters, name):
        # Nucleotide record typed with the unambiguous DNA alphabet.
        return SeqRecord(Seq(letters, alphabet=IUPAC.IUPACUnambiguousDNA()),
                         id=name)

    def protein_record(letters, name):
        # Protein record typed with the IUPAC protein alphabet.
        return SeqRecord(Seq(letters, alphabet=IUPAC.protein), id=name)

    # Test set 1
    seq1 = dna_record(
        'TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGTGGGGCGCTGGAG',
        'pro1')
    seq2 = dna_record(
        'TCAGGGACTTCGAGAACCAAGCGCTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGTGGAGCACTGGAG',
        'pro2')
    pro1 = protein_record('SGTARTKLLLLLAALCAAGGALE', 'pro1')
    pro2 = protein_record('SGTSRTKRLLLLAALGAAGGALE', 'pro2')
    self.aln1 = MultipleSeqAlignment([pro1, pro2])
    self.seqlist1 = [seq1, seq2]

    # Test set 2
    # M K K H E L(F)L C Q G T S N K L T Q(L)L G T F E D H F L S L Q R M F N N C E V V
    seq3 = dna_record(
        'ATGAAAAAGCACGAGTTACTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC',
        'pro1')
    # seq4 with the removed bases shown as gaps:
    # 'ATGAAAAAGCACGAGTT CTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAA TGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC'
    seq4 = dna_record(
        'ATGAAAAAGCACGAGTTCTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAATGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC',
        'pro2')
    # Earlier gapped draft of seq5:
    # 'ATGAAAAAGCACGAGTT CTTTGCCAAGGGACAAGTAACAAGCTCACCC TTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC'
    seq5 = dna_record(
        'ATGAAAAAGCACGAGTTACTTTGCCAAGGGACAAGTAACAAGCTCACCCTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC',
        'pro3')
    pro3 = protein_record(
        'MKKHELLCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL',
        'pro1')
    pro4 = protein_record(
        'MKKHEFLCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL',
        'pro2')
    pro5 = protein_record(
        'MKKHELLCQGTSNKLTLLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL',
        'pro3')
    self.aln2 = MultipleSeqAlignment([pro3, pro4, pro5])
    self.seqlist2 = [seq3, seq4, seq5]
def sequence(ungapped, position, length):
    """Return ``length`` nucleotides of ``ungapped`` starting at ``position``.

    The record's sequence is re-typed as unambiguous DNA and sliced as
    ``[position:position + length]``.  For a non-negative position the slice
    is returned as a string; for a negative position the slice is passed
    through ``getNegative`` first.
    """
    dna = Seq(str(ungapped.seq), IUPAC.IUPACUnambiguousDNA())
    window = str(dna[position:position + length])
    if position < 0:
        return getNegative(window)
    return window
def setUp(self):
    """Build the codon alignment for fixture 6 and store it on ``self.aln``.

    The fixture supplies a nucleotide FASTA file, a protein Clustal
    alignment and an id-correspondence file (in that order).
    """
    paths = TEST_ALIGN_FILE6[0]
    nucl = SeqIO.parse(paths[0], 'fasta',
                       alphabet=IUPAC.IUPACUnambiguousDNA())
    prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
    # First two whitespace-separated columns of each line map one id onto
    # the other.
    id_corr = {}
    with open(paths[2]) as handle:
        for line in handle:
            fields = line.split()
            id_corr[fields[0]] = fields[1]
    self.aln = CodonAlign.build(
        prot, nucl, corr_dict=id_corr,
        alphabet=CodonAlign.default_codon_alphabet)
def gb(self):
    """Return this object rendered as GenBank-formatted text.

    Each feature becomes a SeqFeature with strand ``1`` when its direction is
    ``'f'`` and ``-1`` otherwise.  When ``self.shape`` is ``'c'`` and a
    feature ends beyond the sequence length, its location is split into two
    parts joined as a CompoundLocation.
    """
    total_length = self.length()
    record = SeqRecord(
        Seq(self.sequence(), IUPAC.IUPACUnambiguousDNA()),
        id=self.name[0:8],
        name=self.name[0:8],
        description=self.description,
    )
    record.features = []
    for feature in self.features():
        feature_type = feature.type
        strand = 1 if feature.direction == 'f' else -1

        if self.shape == 'c' and feature.end > total_length:
            # Split into the part up to the sequence end and the remainder
            # counted from position 0.
            up_to_end = FeatureLocation(ExactPosition(feature.start),
                                        ExactPosition(total_length), strand)
            from_origin = FeatureLocation(
                ExactPosition(0),
                ExactPosition(feature.end - total_length), strand)
            if strand == 1:
                parts = [up_to_end, from_origin]
            else:
                parts = [from_origin, up_to_end]
            location = CompoundLocation(parts)
        else:
            location = FeatureLocation(ExactPosition(feature.start),
                                       ExactPosition(feature.end), strand)

        qualifiers = dict(
            (q.name, q.data) for q in feature.qualifiers.all())
        record.features.append(
            SeqFeature(location, feature_type, qualifiers=qualifiers))
    return record.format('genbank')
def _guess_consensus_alphabet(self, ambiguous):
    """Pick an (ungapped) alphabet for an alignment consensus sequence (PRIVATE).

    This looks at the sequences we have, checks their type, and returns an
    appropriate alphabet which makes sense with the sequences we've got.

    Args:
        ambiguous: the character that will stand for ambiguous columns in
            the consensus.

    Returns:
        The alignment's base alphabet when ``ambiguous`` is one of its
        letters (or it defines no letters); otherwise a more permissive
        alphabet of the same molecule type.

    Raises:
        ValueError: if a sequence's base alphabet is not an instance of the
            alignment's base alphabet class.
    """
    # Start with the (un-gapped version of) the alignment alphabet
    a = Alphabet._get_base_alphabet(self.alignment._alphabet)

    # Now check it's compatible with all the rest of the sequences
    for record in self.alignment:
        # Get the (un-gapped version of) the sequence's alphabet
        alt = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(alt, a.__class__):
            raise ValueError(
                "Alignment contains a sequence with an incompatible alphabet."
            )

    # Check the ambiguous character we are going to use in the consensus
    # is in the alphabet's list of valid letters (if defined).
    if (
        hasattr(a, "letters")
        and a.letters is not None
        and ambiguous not in a.letters
    ):
        # We'll need to pick a more generic alphabet...
        # For DNA/RNA try the IUPAC *ambiguous* alphabet first; re-testing
        # the unambiguous one (as before) could never succeed, because this
        # branch is only reached when the character is missing from it.
        if isinstance(a, IUPAC.IUPACUnambiguousDNA):
            if ambiguous in IUPAC.IUPACAmbiguousDNA().letters:
                a = IUPAC.IUPACAmbiguousDNA()
            else:
                a = Alphabet.generic_dna
        elif isinstance(a, IUPAC.IUPACUnambiguousRNA):
            if ambiguous in IUPAC.IUPACAmbiguousRNA().letters:
                a = IUPAC.IUPACAmbiguousRNA()
            else:
                a = Alphabet.generic_rna
        elif isinstance(a, IUPAC.IUPACProtein):
            if ambiguous in IUPAC.ExtendedIUPACProtein().letters:
                a = IUPAC.ExtendedIUPACProtein()
            else:
                a = Alphabet.generic_protein
        else:
            a = Alphabet.single_letter_alphabet
    return a
def setUp(self):
    """Build the codon alignment for fixture 6 and store it on ``self.aln``.

    BiopythonWarning warnings raised while building are suppressed.
    """
    paths = TEST_ALIGN_FILE6[0]
    nucl = SeqIO.parse(paths[0], 'fasta',
                       alphabet=IUPAC.IUPACUnambiguousDNA())
    prot = AlignIO.read(paths[1], 'clustal', alphabet=IUPAC.protein)
    # First two whitespace-separated columns of each line map one id onto
    # the other.
    id_corr = {}
    with open(paths[2]) as handle:
        for line in handle:
            fields = line.split()
            id_corr[fields[0]] = fields[1]
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', BiopythonWarning)
        codon_aln = codonalign.build(
            prot, nucl, corr_dict=id_corr,
            alphabet=codonalign.default_codon_alphabet)
    self.aln = codon_aln
def setUp(self):
    """Build one codon alignment per fixture and store them in ``self.alns``.

    Every fixture is indexed as ``i[0]`` (file paths: nucleotide FASTA,
    protein Clustal alignment and, for ``'id'`` fixtures, an
    id-correspondence file) and ``i[1]`` (loading mode: ``'parse'``,
    ``'index'`` or ``'id'``).
    """
    self.aln_file = [
        TEST_ALIGN_FILE1, TEST_ALIGN_FILE2, TEST_ALIGN_FILE3,
        TEST_ALIGN_FILE4, TEST_ALIGN_FILE5, TEST_ALIGN_FILE6
    ]
    alns = []
    for i in self.aln_file:
        if i[1] == 'parse':
            nucl = SeqIO.parse(i[0][0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = CodonAlign.build(
                    prot, nucl, alphabet=CodonAlign.default_codon_alphabet)
        elif i[1] == 'index':
            nucl = SeqIO.index(i[0][0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = CodonAlign.build(
                    prot, nucl, alphabet=CodonAlign.default_codon_alphabet,
                    max_score=20)
        elif i[1] == 'id':
            nucl = SeqIO.parse(i[0][0], 'fasta',
                               alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
            # Read the id-correspondence file inside a context manager so
            # the handle is closed instead of being left to the garbage
            # collector.
            with open(i[0][2]) as handle:
                id_corr = dict(
                    (line.split()[0], line.split()[1]) for line in handle)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = CodonAlign.build(
                    prot, nucl, corr_dict=id_corr,
                    alphabet=CodonAlign.default_codon_alphabet)
        alns.append(caln)
    self.alns = alns
def cast_sequence(ungapped_sequence):
    """Re-type every record's sequence as unambiguous DNA.

    ungapped_sequence: an iterable of records exposing a ``seq`` attribute.

    Returns a list of Seq objects (one per record, in input order) built
    from the string form of each record's sequence.
    """
    return [
        Seq(str(record.seq), IUPAC.IUPACUnambiguousDNA())
        for record in ungapped_sequence
    ]
def gb(self):
    """Return this object rendered as GenBank-formatted text.

    Each feature becomes a SeqFeature located from ``start - 1`` to ``end``
    and carrying the feature's qualifiers as a name -> data mapping.
    """
    record = SeqRecord(
        Seq(self.sequence(), IUPAC.IUPACUnambiguousDNA()),
        id=self.name[0:8],
        name=self.name[0:8],
        description=self.description,
    )
    converted = []
    for feature in self.features():
        location = FeatureLocation(ExactPosition(feature.start - 1),
                                   ExactPosition(feature.end))
        qualifiers = dict(
            (q.name, q.data) for q in feature.qualifiers.all())
        converted.append(
            SeqFeature(location, feature.type, qualifiers=qualifiers))
    record.features = converted
    return record.format('genbank')
def pwm_scan(self, left=0, right=0):
    """Search every peak region of the genome for matches to ``self.motif``.

    The motif's consensus, PSSM and the motif itself are printed first.  For
    each peak in ``self._peaks`` the region ``[start - left, end + right)``
    of the chromosome (name with any 'chr' removed) is scanned, and a
    summary line plus the list of matches (if any) is printed.

    Args:
        left: number of extra bases to include before each peak start.
        right: number of extra bases to include after each peak end.
    """
    records = SeqIO.index(self.genome, 'fasta',
                          alphabet=IUPAC.IUPACUnambiguousDNA())
    try:
        pssm = self.motif.pssm
        print(self.motif.consensus)
        print(pssm)
        print(self.motif)
        for gene, peaks in self._peaks.items():
            for peak in peaks:
                chrom = str(peak.chrom).replace('chr', '')
                # Clamp at 0: a negative start would be interpreted as an
                # offset from the end of the chromosome by the slice.
                begin = max(0, int(peak.start) - left)
                end = int(peak.end) + right
                seq = records[chrom][begin:end].seq
                matches = list(pssm.search(seq))
                print("Gene: " + str(gene) + ", height: " +
                      str(peak.enrichment) + ", sites: " + str(len(matches)))
                if matches:
                    print(matches)
    finally:
        # Release the indexed FASTA file even if scanning fails.
        records.close()
def test_mk(self):
    """Check ``codonalign.mktest`` on fixture 7 against the value 0.0021."""
    paths = TEST_ALIGN_FILE7[0]
    nucl_index = SeqIO.index(paths[0], "fasta",
                             alphabet=IUPAC.IUPACUnambiguousDNA())
    pro_aln = AlignIO.read(paths[1], "clustal", alphabet=IUPAC.protein)
    codon_aln = codonalign.build(pro_aln, nucl_index)
    nucl_index.close()  # Close indexed FASTA file
    # Three row groups of the codon alignment are passed to the test.
    groups = [codon_aln[1:12], codon_aln[12:16], codon_aln[16:]]
    self.assertAlmostEqual(codonalign.mktest(groups), 0.0021, places=4)
def tntable(table=11):
    """Build amino-acid/codon lookup tables for a translation table.

    Args:
        table: translation table identifier passed to ``Seq.translate``.

    Returns:
        A pair ``(_table, translation)`` where ``_table`` maps each
        translated symbol to a dict keyed by its codons (each value an empty
        dict) and ``translation`` maps each of the 64 codons to its
        upper-cased translation.
    """
    _table = {}
    translation = {}
    for bases in itertools.product('ACTG', repeat=3):
        codon = ''.join(bases)
        aa = str(Seq(codon, IUPAC.IUPACUnambiguousDNA()).translate(
            table=table)).upper()
        translation[codon] = aa
        # setdefault replaces a bare ``except:`` that was only there to
        # create the inner dict on first use.
        _table.setdefault(aa, {})[codon] = {}
    return _table, translation
def setUp(self):
    """Create three protein alignments and their matching nucleotide records.

    Results are stored as ``self.aln1``..``self.aln3`` and
    ``self.seqlist1``..``self.seqlist3``; ``self.codontable3`` holds codon
    table 3 for the third set.
    """
    def dna_record(letters, name):
        # Nucleotide record typed with the unambiguous DNA alphabet.
        return SeqRecord(Seq(letters, alphabet=IUPAC.IUPACUnambiguousDNA()),
                         id=name)

    def protein_record(letters, name):
        # Protein record typed with the IUPAC protein alphabet.
        return SeqRecord(Seq(letters, alphabet=IUPAC.protein), id=name)

    # Test set 1
    seq1 = dna_record(
        'TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGTGGGGCGCTGGAG',
        'pro1')
    seq2 = dna_record(
        'TCAGGGACTTCGAGAACCAAGCGCTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGTGGAGCACTGGAG',
        'pro2')
    pro1 = protein_record('SGTARTKLLLLLAALCAAGGALE', 'pro1')
    pro2 = protein_record('SGTSRTKRLLLLAALGAAGGALE', 'pro2')
    self.aln1 = MultipleSeqAlignment([pro1, pro2])
    self.seqlist1 = [seq1, seq2]

    # Test set 2
    # M K K H E L(F)L C Q G T S N K L T Q(L)L G T F E D H F L S L Q R M F N N C E V V
    seq3 = dna_record(
        'ATGAAAAAGCACGAGTTACTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC',
        'pro1')
    # seq4 with the removed bases shown as gaps:
    # 'ATGAAAAAGCACGAGTT CTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAA TGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC'
    seq4 = dna_record(
        'ATGAAAAAGCACGAGTTCTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAATGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC',
        'pro2')
    # Earlier gapped draft of seq5:
    # 'ATGAAAAAGCACGAGTT CTTTGCCAAGGGACAAGTAACAAGCTCACCC TTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC'
    seq5 = dna_record(
        'ATGAAAAAGCACGAGTTACTTTGCCAAGGGACAAGTAACAAGCTCACCCTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC',
        'pro3')
    pro3 = protein_record(
        'MKKHELLCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL',
        'pro1')
    pro4 = protein_record(
        'MKKHEFLCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL',
        'pro2')
    pro5 = protein_record(
        'MKKHELLCQGTSNKLTLLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL',
        'pro3')
    self.aln2 = MultipleSeqAlignment([pro3, pro4, pro5])
    self.seqlist2 = [seq3, seq4, seq5]

    # Test set 3
    # use Yeast mitochondrial codon table
    seq6 = dna_record(
        'ATGGCAAGGGACCACCCAGTTGGGCACTGATATGATCGGGTGTATTTGCAGAGTAGTAACCTTTCTTTTCTCAAGACCATCCAG',
        'pro6')
    seq7 = dna_record(
        'ATGGCAAGGCACCATCCAGTTGAGCACTGATATGATCGGGTGTATTTGCAGAGTAGTAACGTGTCTCTGCTCAAGACCATCCAG',
        'pro7')
    seq8 = dna_record(
        'ATGGCAGGGGACCACCCAGTTGGGCACTGATATGATCGTGTGTATCTGCAGAGTAGTAACCACTCTTTTCTCATGACCATCCAG',
        'pro8')
    pro6 = protein_record('MARDHPVGHWYDRVYLQSSNTSFTKTIQ', 'pro6')
    pro7 = protein_record('MARHHPVEHWYDRVYLQSSNVSTTKTIQ', 'pro7')
    pro8 = protein_record('MAGDHPVGHWYDRVYTQSSNHSFTMTIQ', 'pro8')
    self.aln3 = MultipleSeqAlignment([pro6, pro7, pro8])
    self.seqlist3 = [seq6, seq7, seq8]
    self.codontable3 = CodonTable.unambiguous_dna_by_id[3]
def get_seq_record(record, start, stop, description):
    """Return a SeqRecord (with qualities) for the query between start and stop.

    Looks up the query positions aligned to the reference positions
    ``start`` and ``stop`` in a SAM record and slices the read's forward
    sequence and forward qualities accordingly.  Nothing is returned when
    either endpoint has no aligned query base.
    """
    # query positions of the bases mapped to start, stop
    # TODO: handle cases where no base is mapped (None)
    aligned = record.get_aligned_pairs(matches_only=True)
    first_position = [pair[0] for pair in aligned if pair[1] == start]
    last_position = [pair[0] for pair in aligned if pair[1] == stop]
    if not (first_position and last_position):
        return None

    name = record.query_name
    if record.is_reverse:
        # Mirror the aligned positions onto the forward-oriented read.
        length = record.query_length
        lower = length - last_position[0]
        upper = length - first_position[0]
    else:
        lower = first_position[0]
        upper = last_position[0]

    # fetch sequence and qual
    seq = Seq(record.get_forward_sequence()[lower:upper],
              IUPAC.IUPACUnambiguousDNA())
    qual = list(record.get_forward_qualities())[lower:upper]

    label = '|'.join([description, direction[record.is_reverse]])
    rec = SeqRecord(seq, id=name, description=label)
    rec.letter_annotations['phred_quality'] = qual
    return rec
def filter_genes(input_fasta, reference_hash):
    """Flag genes whose translation looks inconsistent and write their names.

    Every key of the FASTA file ``input_fasta`` is compared with the entry of
    the same key in ``reference_hash``.  Keys failing any check below are
    collected and written, one per line, to ``input_fasta + ".filt_genes"``.
    """
    bad_genes = set()
    in_fasta = pyfasta.Fasta(input_fasta)
    for keys in in_fasta.keys():
        tmp_gene = str(in_fasta[keys]).upper()
        try:
            tmp_gene = analysis_functions_introgressions.extend_ambiguous_dna(
                tmp_gene)
        except:
            # NOTE(review): bare except - any failure prints the sequence and
            # aborts the whole program with exit status 1.
            print(tmp_gene)
            sys.exit(1)
        ref_gene = reference_hash[keys]
        # Translate the gap-stripped reference up to the first stop codon.
        ref_gene_translate = SeqRecord(Seq.Seq(
            str(ref_gene).replace("-", ""),
            alphabet=IUPAC.IUPACUnambiguousDNA()),
            id="REF").seq.translate(to_stop=True)
        # Protein length + 1 is compared with a third of the reference length
        # (the length here still includes any "-" characters).
        if ((len(ref_gene_translate) + 1) != len(ref_gene) / 3):
            bad_genes.add(keys)
        # presumably extend_ambiguous_dna returns a collection of candidate
        # sequences; verify against that helper
        for options in (tmp_gene):
            tmp_gene_translate = SeqRecord(
                Seq.Seq(str(options).replace("-", ""),
                        alphabet=IUPAC.IUPACUnambiguousDNA()),
                id="REF").seq.translate(to_stop=True)
            # Both branches apply the same length check; they differ only in
            # the int() wrapped around len(options).
            if ("strict" in input_fasta):
                if ((len(tmp_gene_translate) + 1) != len(options) / 3):
                    bad_genes.add(keys)
            else:
                if ((len(tmp_gene_translate) + 1) != int(len(options)) / 3):
                    bad_genes.add(keys)
            # Also flag candidates whose protein length is at most 90% of the
            # reference's, or whose 90% length reaches the reference's.
            if (len(tmp_gene_translate) <= len(ref_gene_translate) * .9 or
                    len(tmp_gene_translate) * .9 >= len(ref_gene_translate)):
                bad_genes.add(keys)
    with open(input_fasta + ".filt_genes", "w") as output_filt:
        for bad_gene in bad_genes:
            output_filt.write(bad_gene + "\n")
def pwm2pssm(file, pseudocount):
    """Load a PWM table and convert it to a PSSM (log-odds matrix).

    Args:
        file: path (or buffer) of the delimited PWM table; it must contain a
            "Pos" column, which is discarded.
        pseudocount: pseudocount passed to the counts normalisation.

    Returns:
        The log-odds matrix computed from the normalised counts.
    """
    pwm = pd.read_table(file)

    # Assuming we are doing RNA motif scanning. Need to replace U with T
    # as Biopython's motif scanner only does DNA
    pwm.rename(columns={'U': 'T'}, inplace=True)

    # ``axis`` must be given by keyword: pandas 2.0 removed the positional
    # form ``drop("Pos", 1)``.
    counts = pwm.drop("Pos", axis=1).to_dict(orient='list')

    motif = motifs.Motif(alphabet=IUPAC.IUPACUnambiguousDNA(), counts=counts)
    normalized = motif.counts.normalize(pseudocount)

    # Can optionally add background, but for now assuming uniform probability
    pssm = normalized.log_odds()

    # NOTE: negative-infinity scores are left as they are; an earlier draft
    # replaced them with a very low number (-10**6).
    return pssm
def get_sequences_from_fasta(fasta_string, limit=None, return_record=False, dna_alphabet=False):
    """Parse FASTA text and return the sequences it contains.

    Args:
        fasta_string: the contents of the FASTA file
        limit: maximum number of sequences to parse (no limit when falsy)
        return_record: return biopython record objects instead of
            ``(id, sequence)`` string pairs
        dna_alphabet: force use of the DNA alphabet
            (IUPAC.IUPACUnambiguousDNA) while parsing

    Returns:
        a list of sequences, in the requested format

    Raises:
        Exception: if a sequence, once upper-cased, contains a character
            other than A, C, G, T or U
    """
    allowed = ['A', 'C', 'G', 'T', 'U']
    handle = StringIO(fasta_string)
    if dna_alphabet:
        parsed = SeqIO.parse(handle, 'fasta',
                             alphabet=IUPAC.IUPACUnambiguousDNA())
    else:
        parsed = SeqIO.parse(handle, 'fasta')

    sequences = []
    for index, entry in enumerate(parsed):
        # upper-case copy of the sequence plus its id
        bases = str(entry.seq).upper()
        entry_id = str(entry.id)

        # only plain nucleotides are accepted
        for char in bases:
            if char not in allowed:
                raise Exception('A sequence contains invalid character: %s' % char)

        sequences.append(entry if return_record else (entry_id, bases))

        if limit and index >= limit - 1:
            break
    return sequences
def _load_pwms(self):
    """Loads and returns position weight matrices.

    Every ``*.txt`` file under ``data/cisbp_rna/pwms`` next to this module
    is read as a tab-separated table and normalised with a pseudocount of
    0.00001.  Files that cannot be converted are skipped.

    Returns:
        a dictionary of pwms, where the key is the CISBP id code (the file
        name without its extension)
    """
    pwms = {}
    dir_path = os.path.dirname(os.path.realpath(__file__))
    for path in glob.glob(dir_path + "/data/cisbp_rna/pwms/*.txt"):
        pwm_id = os.path.splitext(os.path.basename(path))[0]
        try:
            table = pd.read_csv(path, sep="\t", header=0, index_col=0)
            # biopython can only handle DNA motifs so we replace U with T
            table.rename(columns={"U": "T"}, inplace=True)
            motif = motifs.Motif(alphabet=IUPAC.IUPACUnambiguousDNA(),
                                 counts=table.to_dict(orient="list"))
            pwms[pwm_id] = motif.counts.normalize(pseudocounts=0.00001)
        except Exception:
            # some pwm files are empty - we skip these.  Catching Exception
            # (not a bare except) keeps the best-effort skip while letting
            # KeyboardInterrupt/SystemExit propagate.
            continue
    return pwms
def test_mk(self):
    """Check ``CodonAlign.mktest`` on fixture 7 against the value 0.0021.

    The check is skipped (with a warning) on Python 2.6 or when
    ``run_tests.is_numpy()`` reports that numpy is unavailable.
    """
    ver = sys.version_info
    if ver[0] == 2 and ver[1] == 6:
        warnings.warn('Python 2.6 detected. Skip testing MK method')
    else:
        from run_tests import is_numpy
        if is_numpy():
            p = SeqIO.index(TEST_ALIGN_FILE7[0][0], 'fasta',
                            alphabet=IUPAC.IUPACUnambiguousDNA())
            pro_aln = AlignIO.read(TEST_ALIGN_FILE7[0][1], 'clustal',
                                   alphabet=IUPAC.protein)
            codon_aln = CodonAlign.build(pro_aln, p)
            # assertAlmostEqual replaces the deprecated alias
            # assertAlmostEquals, which Python 3.12 removed.
            self.assertAlmostEqual(
                round(CodonAlign.mktest(
                    [codon_aln[1:12], codon_aln[12:16], codon_aln[16:]]), 4),
                0.0021, places=4)
        else:
            warnings.warn('Numpy not installed. Skip MK test.')
def read_tag_file(self, tag_file_name):
    """Process the tags file.

    Process the tag file, which has the format:
    TagName ForwardTag.
    Each non-blank line adds one entry to ``self._tag_dict`` mapping the tag
    name to the forward tag as an unambiguous-DNA Seq.

    Parameters
    ----------
    tag_file_name : string
        input tag file name - one sample per line, with format:
        TagName ForwardTag

    Raises
    ------
    IOError
        when a line does not have exactly two whitespace-separated fields,
        or when a tag name is already present.
    """
    # The context manager closes the file on the error paths too.
    with open(tag_file_name) as tag_file:
        for line in tag_file:
            line = line.strip()
            if not line:
                continue
            tokens = line.split()
            if len(tokens) != 2:
                self.logger.error("Line does not have the correct format.")
                raise IOError("Line:" + line +
                              "\ndoes not have the correct format.")
            if tokens[0] in self._tag_dict:
                self.logger.error("Repeat tag name: " + tokens[0])
                raise IOError(tokens[0] + " already present in file.")
            self._tag_dict[tokens[0]] = Seq(tokens[1],
                                            IUPAC.IUPACUnambiguousDNA())
    # Single-line literal: the former backslash continuation inside the
    # string embedded the next line's indentation in the message.
    self.logger.info("Read " + str(len(self._tag_dict)) +
                     " valid tag combinations.")
def __init__(self, _number, _seq):
    """Store the number, the DNA sequence and its complement.

    ``_seq`` is wrapped in a Seq typed as unambiguous DNA; ``rseq`` holds
    that sequence's ``complement()``.  The feature list starts empty.
    """
    dna = Seq(_seq, IUPAC.IUPACUnambiguousDNA())
    self.number = _number
    self.seq = dna
    self.rseq = dna.complement()
    self.features = []
# File names (without directories) used later when marking the output file
alignment_file_name = os.path.basename(sys.argv[1])
motif_file_name = os.path.basename(sys.argv[2])
print ("alignment file: " + alignment_file_name)
print ("motif file: " + motif_file_name)

# Remove the "-" gap characters from every aligned record
raw_sequences = []
for record in alignment:
    ungapped = record.seq.ungap("-")
    raw_sequences.append(SeqRecord(ungapped, id = record.id))

## Re-type all raw sequences as IUPAC.IUPACUnambiguousDNA()
raw_sequences_2 = []
for seq in raw_sequences:
    dna = Seq(str(seq.seq), IUPAC.IUPACUnambiguousDNA())
    raw_sequences_2.append(dna)

#####################
## Motifs
#####################

# Normalising with zero pseudocounts leaves the matrix a pwm
pwm = motif.counts.normalize(pseudocounts=0.0)
pssm = pwm.log_odds()
# Kept for later retrieval of the nucleotide sequence
motif_length = len(motif)

######################
## Searching for Motifs in Sequences
######################

## Returns a list of arrays with a score for each position
## This gives the score for each position
seqs = [] header = None for seq_record in SeqIO.parse(fastafile, "fasta"): seq_record.seq.alphabet = IUPAC.unambiguous_dna seqs.append(seq_record) return seqs fasta = ReadFASTA(args.file[0]) regions = pd.read_csv( args.csv[0], converters={"coding": literal_eval}, index_col="chr" ) # the chromosomes are the indexes here - that's fine they should be unique # cycle through fasta filte OR = [] for gene in fasta: seg = gene.id coding_regions = regions.loc[seg, "coding"] open_reading = copy.deepcopy(gene) open_reading.seq = Seq('', alphabet=IUPAC.IUPACUnambiguousDNA()) for splice in coding_regions: open_reading.seq = open_reading.seq + gene.seq[ splice[0] - 1:splice[1]] # for python numbering and slicing OR.append(open_reading) output_handle = open(args.out_fa[0], "w") SeqIO.write(OR, output_handle, "fasta") output_handle.close()
def divergence():
    """Print pairwise divergence statistics for two sets of coding sequences.

    Command-line arguments (read from ``sys.argv``):
        1, 2 -- FASTA files with the DNA sequences of species 1 and 2
        3, 4 -- FASTA files with the protein sequences of species 1 and 2
        5    -- output file path (opened for writing, see the note below)
        6    -- method name forwarded to ``cal_dn_ds``
        7    -- path to the MUSCLE executable

    Records are paired by position across the four files. For each pair
    the two proteins are aligned with MUSCLE, a codon alignment is built
    from it, and one " ; "-separated row is printed on stdout holding:
    sequence id, dN, dS, third-codon-position distance, raw distance,
    both DNA lengths, both GC contents, their mean, and the shorter of
    the two DNA lengths.

    dN and dS are reported as 9.999 when ``cal_dn_ds`` raises ValueError,
    ZeroDivisionError or KeyError. Any other failure while processing a
    pair prints a traceback plus a one-line message, and processing
    continues with the next pair.
    """
    fic1dna = sys.argv[1]
    fic2dna = sys.argv[2]
    fic1prot = sys.argv[3]
    fic2prot = sys.argv[4]
    outfile_path = sys.argv[5]
    method = sys.argv[6]
    muscle_exe = sys.argv[7]

    # NOTE(review): the last column is labelled "Mean_length" but is filled
    # with the shorter of the two lengths; label kept as-is for consumers.
    columns = ("seq.id", "dN", "dS", "Dist_third_pos", "Dist_brute",
               "Length_seq_1", "Length_seq2", "GC_content_seq1",
               "GC_content_seq2", "GC", "Mean_length")

    # NOTE(review): the output file is created/truncated and kept open for
    # the whole run, but every row is printed to stdout and nothing is
    # written to the file. That behaviour is preserved here; the context
    # manager only guarantees the handle is closed on every exit path.
    with open(outfile_path, "w", encoding='utf-8'):
        seq1dna = list(
            SeqIO.parse(fic1dna, "fasta",
                        alphabet=IUPAC.IUPACUnambiguousDNA()))
        seq2dna = list(
            SeqIO.parse(fic2dna, "fasta",
                        alphabet=IUPAC.IUPACUnambiguousDNA()))
        seq1prot = list(SeqIO.parse(fic1prot, "fasta",
                                    alphabet=IUPAC.protein))
        seq2prot = list(SeqIO.parse(fic2prot, "fasta",
                                    alphabet=IUPAC.protein))

        print("Nombre de paires de sequences a analyser: ", len(seq1dna))
        _print_row(*columns)

        for u, dna_rec1 in enumerate(seq1dna):
            try:
                row = _pair_row(dna_rec1, seq2dna[u], seq1prot[u],
                                seq2prot[u], method, muscle_exe)
                _print_row(*row)
            except Exception:
                # Not a bare ``except``: Ctrl-C and SystemExit must still be
                # able to stop the run.
                traceback.print_exc()
                # The partner record may itself be missing when the second
                # DNA file is shorter; do not fail again while reporting.
                other_id = seq2dna[u].id if u < len(seq2dna) else "<missing>"
                print("Une erreur est survenue pour la sequence: ",
                      dna_rec1.id, "vs", other_id)


def _print_row(*fields):
    """Print the fields as one " ; "-separated line on stdout."""
    print(" ; ".join(str(field) for field in fields))


def _strip_gap_columns(record_a, record_b):
    """Return both aligned sequences as strings without any column that
    holds a gap ("-") in either of them."""
    kept = [(a, b) for a, b in zip(record_a, record_b)
            if a != "-" and b != "-"]
    return "".join(a for a, _ in kept), "".join(b for _, b in kept)


def _count_mismatches(seq_a, seq_b):
    """Count the positions at which the two sequences differ (compared up
    to the length of the shorter one)."""
    return sum(1 for a, b in zip(seq_a, seq_b) if a != b)


def _pair_row(dna_rec1, dna_rec2, prot_rec1, prot_rec2, method, muscle_exe):
    """Align one protein pair with MUSCLE, build the codon alignment and
    return the tuple of fields making up the output row for that pair."""
    # Write the two unaligned proteins to a scratch FASTA file for MUSCLE.
    unaligned = (
        SeqRecord(Seq(str(prot_rec1.seq), alphabet=IUPAC.protein),
                  id='protein1'),
        SeqRecord(Seq(str(prot_rec2.seq), alphabet=IUPAC.protein),
                  id='protein2'),
    )
    with open("outfile_unaligned.fa", "w", encoding='utf-8') as output_handle:
        for record in unaligned:
            SeqIO.write(record, output_handle, "fasta")

    # MUSCLE reads the scratch file and writes the aligned sequences.
    muscle_cline = MuscleCommandline(muscle_exe,
                                     input="outfile_unaligned.fa",
                                     out="outfile_aligned.aln")
    muscle_cline()
    alns = AlignIO.read("outfile_aligned.aln", "fasta")

    # Pick the aligned proteins by id rather than by position, so a
    # reordered MUSCLE output cannot pair a protein with the wrong DNA.
    aligned = {record.id: str(record.seq) for record in alns}
    prot1 = SeqRecord(Seq(aligned['protein1'], alphabet=IUPAC.protein),
                      id='pro1')
    prot2 = SeqRecord(Seq(aligned['protein2'], alphabet=IUPAC.protein),
                      id='pro2')
    nuc1 = SeqRecord(Seq(str(dna_rec1.seq),
                         alphabet=IUPAC.IUPACUnambiguousDNA()), id='nuc1')
    nuc2 = SeqRecord(Seq(str(dna_rec2.seq),
                         alphabet=IUPAC.IUPACUnambiguousDNA()), id='nuc2')

    # Codon alignment derived from the protein alignment and both DNAs.
    codon_aln = codonalign.build(MultipleSeqAlignment([prot1, prot2]),
                                 [nuc1, nuc2])

    lengthseq1 = len(nuc1.seq)
    lengthseq2 = len(nuc2.seq)
    gc_content_seq1 = GC(nuc1.seq)
    gc_content_seq2 = GC(nuc2.seq)
    gc_mean = (gc_content_seq1 + gc_content_seq2) / 2
    min_length = min(lengthseq1, lengthseq2)

    # Raw distance: fraction of differing sites once gap columns are gone.
    # An empty gap-free alignment raises ZeroDivisionError here, which the
    # caller reports as a failed pair.
    seq1, seq2 = _strip_gap_columns(codon_aln[0], codon_aln[1])
    distance_brute = round(_count_mismatches(seq1, seq2) / len(seq1), 3)

    # Third-codon-position distance, over alphabetic characters only.
    seq1_third_pos = [base for base in seq1[2::3] if base.isalpha()]
    seq2_third_pos = [base for base in seq2[2::3] if base.isalpha()]
    distance_third_pos = round(
        _count_mismatches(seq1_third_pos, seq2_third_pos)
        / len(seq2_third_pos), 3)

    # dN / dS with the requested method; 9.999 marks pairs for which the
    # estimate could not be computed (e.g. saturation too high).
    try:
        dN, dS = cal_dn_ds(codon_aln[0], codon_aln[1], method=method)
    except (ValueError, ZeroDivisionError, KeyError):
        dN = dS = 9.999

    return (dna_rec1.id, dN, dS, distance_third_pos, distance_brute,
            lengthseq1, lengthseq2, gc_content_seq1, gc_content_seq2,
            gc_mean, min_length)
} #Lists necessary for converting ambiguous nucleotides sequence5f = list() #5' cutting point sequence3f = list() #3' end cutting point sequence5c = list() #5' complementary cutting point #Creating the 5' and 3' cutting points for nucleotide in sequence5: sequence5f.append( DIUPAC[nucleotide]) #Changes the nucleotide key for its value sequence5f = "".join(sequence5f) #Joins the list into a string for nucleotide in sequence3: sequence3f.append(DIUPAC[nucleotide]) sequence3f = "".join(sequence3f) #Obtaining complementary of 5' target sequence (cutting point) to be able to check whether a sequence is the forward or the reverse check = Seq(sequence5, IUPAC.IUPACUnambiguousDNA()) checkseq = check.reverse_complement() for nucleotide in checkseq: sequence5c.append(DIUPAC[nucleotide]) sequence5c = "".join(sequence5c) log.write( "Target string used to cut 5' end: %s\nTarget string used to cut 3' end: %s\n\n" % (sequence5f, sequence3f)) Preverse = list() #Necessary for cutting Cutfiles = list() def cutfile(filename, sequence5f, sequence3f ): #Function to cut the fastq files in the indicated points global cut #The variable can also be used outside the function global uncut