def summary(msa, output, title):
    """Write per-column residue frequencies and entropy of an alignment to CSV.

    The alignment is read from ``msa`` in FASTA format and handed to
    ``information_content`` together with the background amino-acid
    frequency table defined below. For every alignment column one row is
    produced holding that column's frequency dictionary plus an ``Entropy``
    value; the resulting table is written to ``output`` with
    ``DataFrame.to_csv``.

    Background frequencies in percent (noted in the original source as the
    "unit_prot" table of 23/11/2017 -- presumably UniProt; confirm)::

        Ala (A) 9.10   Gln (Q) 3.79   Leu (L) 9.87   Ser (S) 6.69
        Arg (R) 5.71   Glu (E) 6.16   Lys (K) 4.99   Thr (T) 5.57
        Asn (N) 3.88   Gly (G) 7.26   Met (M) 2.38   Trp (W) 1.29
        Asp (D) 5.45   His (H) 2.19   Phe (F) 3.92   Tyr (Y) 2.93
        Cys (C) 1.21   Ile (I) 5.70   Pro (P) 4.85   Val (V) 6.88

    Args:
        msa: FASTA alignment, passed unchanged to ``AlignIO.read``.
        output: Destination passed unchanged to ``DataFrame.to_csv``.
        title: Not used by this function; kept so existing callers keep
            working.
    """
    # Expected (background) frequency of each amino acid, as fractions.
    e_freq_dict = {
        'A': 0.091, 'R': 0.0571, 'N': 0.0388, 'D': 0.0545, 'C': 0.0121,
        'Q': 0.0379, 'E': 0.0616, 'G': 0.0726, 'H': 0.0219, 'I': 0.0570,
        'L': 0.0987, 'K': 0.0499, 'M': 0.0238, 'F': 0.0392, 'P': 0.0485,
        'S': 0.0669, 'T': 0.0557, 'W': 0.0129, 'Y': 0.0293, 'V': 0.0688,
    }
    e_freq_table = FreqTable.FreqTable(e_freq_dict, FreqTable.FREQ,
                                       alphabet=Alphabet.ProteinAlphabet())

    alignment = AlignIO.read(msa, "fasta",
                             alphabet=Alphabet.ProteinAlphabet())
    summary_align = AlignInfo.SummaryInfo(alignment)
    _total_entropy, entropy_columns, freq_dict_columns = information_content(
        summary_align, e_freq_table=e_freq_table)

    # Build one single-row frame per alignment column and concatenate them
    # once at the end. Appending to a DataFrame inside the loop copies the
    # whole frame every iteration, and DataFrame.append no longer exists in
    # pandas 2.x.
    frames = []
    for i in range(len(entropy_columns)):
        freq_dict = freq_dict_columns[i]
        row = pandas.DataFrame([freq_dict], columns=freq_dict.keys())
        row['Entropy'] = entropy_columns[i]
        frames.append(row)

    if frames:
        df = pandas.concat(frames, ignore_index=True)
    else:
        # pandas.concat rejects an empty list; an alignment without columns
        # still produces an (empty) output file, as before.
        df = pandas.DataFrame()

    df.to_csv(output)
def test_fileio():
    """Round-trip a generated topology database through disk and check 'BB1'.

    The database built by ``generate`` is written with
    ``write_topology_database``, read back with ``read_topology_database``,
    the ``test.xml`` file under ``testFilePath`` is deleted, and the
    vertices, bond, angle and improper edges of the 'BB1' entry are compared
    with the expected values.
    """
    from Bio import Alphabet

    ffName = 'test'
    inferAngles = True
    topPath = testFilePath

    # NOTE(review): size/letters are overridden on a ProteinAlphabet instance
    # so that it carries the names 'BB1'/'BB2' -- presumably this is what
    # generate() reads; confirm against its implementation.
    alphabet = Alphabet.ProteinAlphabet()
    alphabet.size = 3
    alphabet.letters = ['BB1', 'BB2']

    testDB = generate(ffName, [alphabet], inferAngles, topPath=topPath)
    write_topology_database(testDB, 'test', [alphabet], outDir=testFilePath)
    result = read_topology_database('test', inDir=testFilePath)
    # The file is removed before the assertions run, so a failing assertion
    # does not leave it behind.
    os.remove(testFilePath + 'test.xml')

    bb1 = result['BB1']
    assert bb1['vertices'] == [('A1', 'A'), ('A2', 'A'),
                               ('A3', 'A'), ('A4', 'A')]

    expected_bonds = (
        (('A1', 'A2'), 1.2),
        (('A2', 'A3'), 1.0),
        (('A3', 'A4'), 1.1),
    )
    for edge, length in expected_bonds:
        assert bb1['bondEdges'][edge] == approx(length)

    expected_angles = (
        (('A1', 'A3'), 1.90787884028338913),
        (('A2', 'A4'), 1.7719368430701863),
    )
    for edge, value in expected_angles:
        assert bb1['angleEdges'][edge] == approx(value, rel=1e-5)

    assert bb1['improperEdges'][('A1', 'A4')] == approx(2.065313144262336,
                                                        rel=1e-5)
def __init__(
    self,
    elem,
    alphabet=Alphabet.ProteinAlphabet(),
    return_raw_comments=False,
):
    """Keep the entry element and the parsing options on the instance.

    - elem: stored as ``self.entry``.
    - alphabet: stored as ``self.alphabet``; the default is a single
      ``ProteinAlphabet`` instance created when the function is defined.
    - return_raw_comments: stored as ``self.return_raw_comments``.
    """
    self.entry = elem
    self.alphabet = alphabet
    self.return_raw_comments = return_raw_comments
def UniprotIterator(handle, alphabet=Alphabet.ProteinAlphabet(),
                    return_raw_comments=False):
    """Iterate over the entries of a UniProt XML document.

    One ``entry`` element is parsed at a time with ``Parser(...).parse()``
    and its result (a SeqRecord, per the original documentation) is yielded;
    the element is cleared afterwards. This generator can be used in
    Bio.SeqIO.

    - handle: object with a ``read`` attribute, or a string holding the XML
      itself (wrapped in a ``StringIO``).
    - alphabet: forwarded to ``Parser``; nucleotide alphabets, also when
      wrapped in ``Gapped``, are rejected with ValueError.
    - return_raw_comments: forwarded to ``Parser``. True --> comment fields
      are returned as complete XML to allow further processing.

    Because this is a generator, the argument checks below only run once
    iteration starts.
    """
    # A nucleotide alphabet makes no sense here, whether bare or gapped.
    if isinstance(alphabet, Alphabet.NucleotideAlphabet):
        raise ValueError("Wrong alphabet %r" % alphabet)
    if (isinstance(alphabet, Alphabet.Gapped)
            and isinstance(alphabet.alphabet, Alphabet.NucleotideAlphabet)):
        raise ValueError("Wrong alphabet %r" % alphabet)

    # Accept raw XML text as well as file-like objects.
    if not hasattr(handle, "read"):
        if not isinstance(handle, str):
            raise Exception('An XML-containing handler or an XML string must be passed')
        handle = StringIO(handle)

    if ElementTree is None:
        from Bio import MissingExternalDependencyError
        raise MissingExternalDependencyError(
            "No ElementTree module was found. "
            "Use Python 2.5+, lxml or elementtree if you "
            "want to use Bio.SeqIO.UniprotIO.")

    entry_tag = NS + "entry"
    for event, elem in ElementTree.iterparse(handle, events=("start", "end")):
        if event != "end" or elem.tag != entry_tag:
            continue
        record = Parser(elem, alphabet=alphabet,
                        return_raw_comments=return_raw_comments).parse()
        yield record
        # Clear the finished entry element once the consumer asks for more.
        elem.clear()
def test_generate():
    """Check the 'BB1' entry of the database built by ffparsergmx.generate.

    Vertices, bond edges, angle edges and the improper edge of 'BB1' are
    compared with the expected values.
    """
    from Bio import Alphabet

    ffname = 'test'
    inferAngles = True
    topPath = testFilePath

    # NOTE(review): size/letters are overridden on a ProteinAlphabet instance
    # so that it carries the names 'BB1'/'BB2' -- presumably this is what
    # generate() reads; confirm against its implementation.
    alphabet = Alphabet.ProteinAlphabet()
    alphabet.size = 3
    alphabet.letters = ['BB1', 'BB2']

    result = ffparsergmx.generate(ffname, [alphabet], inferAngles,
                                  topPath=topPath)

    bb1 = result['BB1']
    assert bb1['vertices'] == [('A1', 'A'), ('A2', 'A'),
                               ('A3', 'A'), ('A4', 'A')]

    expected_bonds = (
        (('A1', 'A2'), 1.2),
        (('A2', 'A3'), 1.0),
        (('A3', 'A4'), 1.1),
    )
    for edge, length in expected_bonds:
        assert bb1['bondEdges'][edge] == approx(length)

    expected_angles = (
        (('A1', 'A3'), 1.90787884028338913),
        (('A2', 'A4'), 1.7719368430701863),
    )
    for edge, value in expected_angles:
        assert bb1['angleEdges'][edge] == approx(value, rel=1e-5)

    # This check uses approx's default tolerance, unlike the angle checks.
    assert bb1['improperEdges'][('A1', 'A4')] == approx(2.065313144262336)
def __init__(
    self,
    elem,
    alphabet=Alphabet.ProteinAlphabet(),
    return_raw_comments=False,
):
    """Initialize the instance from an entry element and parsing options.

    - elem: stored as ``self.entry``.
    - alphabet: stored as ``self.alphabet``; the default is a single
      ``ProteinAlphabet`` instance created when the function is defined.
    - return_raw_comments: stored as ``self.return_raw_comments``.
    """
    self.entry = elem
    self.alphabet = alphabet
    self.return_raw_comments = return_raw_comments
def conservation(msa_path): import numpy as np import scipy.stats as sc from Bio import AlignIO from Bio.Align import AlignInfo from Bio.Alphabet import IUPAC from Bio.SubsMat import FreqTable import Bio.Alphabet as Alphabet from Bio import motifs for filename in os.listdir(msa_path): if filename.endswith(".cluster"): alignment = AlignIO.read(msa_path + filename, "fasta", alphabet=Alphabet.ProteinAlphabet()) columns_quantity = [] columns_frequency = [] #summary_align = AlignInfo.SummaryInfo(alignment) #pssm = summary_align.pos_specific_score_matrix() #print pssm for x in range(0, len(alignment[0].seq) - 1): column = alignment[:, x] quantity = letters for f in column: print(f) quantity[f] += 1 double = 20 / len(alignment) print len(alignment) print(quantity) #frequency=list(map(lambda x: x/len(alignment), quantity)) frequency = dict( map(lambda (k, v): (k, v / len(alignment)), quantity.iteritems())) print frequency columns_quantity.append(quantity) columns_frequency.append(frequency) print(columns_quantity) ''' m = motifs.create(alignment,alphabet=Alphabet.ProteinAlphabet()) print (m) alfa = summary_align.alignment._alphabet base_alpha = Alphabet._get_base_alphabet(alfa) print(summary_align) print(alfa) print(base_alpha) data=summary_align.information_content(5,30) print(data)''' #n is the number of data points ''''n=10
def reduceSeq(infile, outfile, alph):
    """Rewrite a protein FASTA file in a reduced amino-acid alphabet.

    Every record of ``infile`` is translated letter by letter and written,
    with its original id and description, to ``outfile`` in FASTA format.

    Letter handling:

    - ``*``, ``-`` and ``U`` are dropped.
    - ``X`` is replaced by one of the 20 standard amino acids chosen
      uniformly at random, before any reduction is applied.
    - With a known ``alph`` every remaining letter is looked up in the
      matching table of ``Alphabet.Reduced``; a letter missing from that
      table raises KeyError.
    - With any other ``alph`` the remaining letters are copied unchanged
      and the sequence gets a plain ``ProteinAlphabet``.

    Args:
        infile: Path of the FASTA file to read.
        outfile: Destination passed unchanged to ``SeqIO.write``.
        alph: One of 'Murphy10', 'Murphy15', 'Murphy8', 'Murphy4', 'PC5',
            'HPModel'; anything else means "no reduction".
    """
    # Attribute names inside Alphabet.Reduced: (alphabet class, letter
    # table). They are resolved with getattr so only the requested
    # reduction is ever touched.
    reductions = {
        'Murphy10': ('Murphy10', 'murphy_10_tab'),
        'Murphy15': ('Murphy15', 'murphy_15_tab'),
        'Murphy8': ('Murphy8', 'murphy_8_tab'),
        'Murphy4': ('Murphy4', 'murphy_4_tab'),
        'PC5': ('PC5', 'pc_5_table'),
        'HPModel': ('HPModel', 'hp_model_tab'),
    }
    skipped = ('*', '-', 'U')
    standard_residues = 'ACDEFGHIKLMNPQRSTVWY'

    if alph in reductions:
        alphabet_name, table_name = reductions[alph]
        make_alphabet = getattr(Alphabet.Reduced, alphabet_name)
        table = getattr(Alphabet.Reduced, table_name)
    else:
        make_alphabet = Alphabet.ProteinAlphabet
        table = None

    # Plain text mode: the 'U' flag used previously is rejected by
    # Python 3.11+, and text mode already translates newlines on Python 3.
    with open(infile) as input_handle:
        records = list(SeqIO.parse(input_handle, "fasta"))

    reduced_records = []
    for record in records:
        letters = []
        for aa in record:
            if aa in skipped:
                continue
            if aa == 'X':
                # random.choice on a string; random.sample() on a set is
                # no longer supported by Python 3.11+.
                aa = random.choice(standard_residues)
            letters.append(aa if table is None else table[aa])

        # Join once instead of growing a Seq one letter at a time.
        new_record = SeqRecord(Seq(''.join(letters), make_alphabet()))
        new_record.id = record.id
        new_record.description = record.description
        reduced_records.append(new_record)

    SeqIO.write(reduced_records, outfile, "fasta")
def molecular_weight(seq, seq_type=None, double_stranded=False, circular=False,
                     monoisotopic=False):
    """Return the molecular weight of a DNA, RNA or protein sequence.

    Only unambiguous letters are allowed. Nucleotide sequences are assumed to
    have a 5' phosphate.

        - seq: String or Biopython sequence object.
        - seq_type: The default (None) is to take the alphabet from the seq
          argument, or assume DNA if the seq argument is a string. Override
          this with a string 'DNA', 'RNA', or 'protein'.
        - double_stranded: Calculate the mass for the double stranded
          molecule?
        - circular: Is the molecule circular (has no ends)?
        - monoisotopic: Use the monoisotopic mass tables?

    Raises TypeError when seq is neither a string nor a Seq/MutableSeq (or
    its base alphabet is not an Alphabet), and ValueError for an unknown or
    contradictory seq_type, for a letter missing from the weight table, and
    for a double stranded protein.

    Note that for backwards compatibility, if the seq argument is a string,
    or Seq object with a generic alphabet, and no seq_type is specified
    (i.e. left as None), then DNA is assumed.

    >>> print("%0.2f" % molecular_weight("AGC"))
    949.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC")))
    949.61

    However, it is better to be explicit - for example with strings:

    >>> print("%0.2f" % molecular_weight("AGC", "DNA"))
    949.61
    >>> print("%0.2f" % molecular_weight("AGC", "RNA"))
    997.61
    >>> print("%0.2f" % molecular_weight("AGC", "protein"))
    249.29

    Or, with the sequence alphabet:

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna)))
    949.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_rna)))
    997.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_protein)))
    249.29

    Also note that contradictory sequence alphabets and seq_type will also
    give an exception:

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import generic_dna
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna), "RNA"))
    Traceback (most recent call last):
      ...
    ValueError: seq_type='RNA' contradicts DNA from seq alphabet

    """
    # Rewritten by Markus Piotrowski, 2014

    # Step 1: work out the sequence type.
    if isinstance(seq, (Seq, MutableSeq)):
        base_alphabet = Alphabet._get_base_alphabet(seq.alphabet)
        # The order of these checks is significant and must be kept.
        if isinstance(base_alphabet, Alphabet.DNAAlphabet):
            tmp_type = 'DNA'
        elif isinstance(base_alphabet, Alphabet.RNAAlphabet):
            tmp_type = 'RNA'
        elif isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            tmp_type = 'protein'
        elif isinstance(base_alphabet, Alphabet.ThreeLetterProtein):
            tmp_type = 'protein'
            # Convert to one-letter sequence. Have to use a string for seq1
            seq = Seq(seq1(str(seq)), alphabet=Alphabet.ProteinAlphabet())
        elif not isinstance(base_alphabet, Alphabet.Alphabet):
            raise TypeError("%s is not a valid alphabet for mass calculations"
                            % base_alphabet)
        else:
            tmp_type = "DNA"  # backward compatibility
        if seq_type and tmp_type and tmp_type != seq_type:
            raise ValueError("seq_type=%r contradicts %s from seq alphabet"
                             % (seq_type, tmp_type))
        seq_type = tmp_type
    elif isinstance(seq, str):
        if seq_type is None:
            seq_type = "DNA"  # backward compatibility
    else:
        raise TypeError("Expected a string or Seq object, not seq=%r" % seq)

    # Step 2: minimal clean-up -- strip all whitespace, upper-case.
    seq = ''.join(str(seq).split()).upper()

    # Step 3: choose the weight table and the mass of water.
    if seq_type == 'DNA':
        weight_table = (IUPACData.monoisotopic_unambiguous_dna_weights
                        if monoisotopic
                        else IUPACData.unambiguous_dna_weights)
    elif seq_type == 'RNA':
        weight_table = (IUPACData.monoisotopic_unambiguous_rna_weights
                        if monoisotopic
                        else IUPACData.unambiguous_rna_weights)
    elif seq_type == 'protein':
        weight_table = (IUPACData.monoisotopic_protein_weights
                        if monoisotopic
                        else IUPACData.protein_weights)
    else:
        raise ValueError("Allowed seq_types are DNA, RNA or protein, not %r"
                         % seq_type)

    water = 18.010565 if monoisotopic else 18.0153

    def _linear_mass(letters):
        # Sum of the residue weights minus one water per bond of a linear chain.
        return sum(weight_table[x] for x in letters) - (len(letters)-1) * water

    # Step 4: mass of the given strand; circular molecules lose one more
    # water for the closing bond.
    try:
        weight = _linear_mass(seq)
    except KeyError as e:
        raise ValueError('%s is not a valid unambiguous letter for %s'
                         % (e, seq_type))
    if circular:
        weight -= water

    # Step 5: optionally add the complementary strand.
    if double_stranded:
        if seq_type == 'protein':
            raise ValueError('double-stranded proteins await their discovery')
        seq = str(Seq(seq).complement())
        weight += _linear_mass(seq)
        if circular:
            weight -= water

    return weight
fasta = tempfile.NamedTemporaryFile() idents = sorted(os.listdir('proteins')) for ident in idents: fasta.write(open('proteins/' + ident).read()) fasta.flush() print 'Calculating multiple alignment...' aligner = subprocess.Popen([ "/tmp/muscle3.8.31_i86linux64", "-clwstrict", "-in", fasta.name, "-out", "-" ], stdout=subprocess.PIPE) stdout, stderr = aligner.communicate() align = AlignIO.read(StringIO(stdout), 'clustal', alphabet=Alphabet.ProteinAlphabet()) from Bio.SubsMat import MatrixInfo def score_match(pair, matrix): if pair not in matrix: return matrix[(tuple(reversed(pair)))] else: return matrix[pair] def score_pairwise(seq1, seq2, matrix, gap_s, gap_e): score = 0 gap = False for i in range(len(seq1)):