示例#1
0
 def __init__(self, AS_msa, SH_msa):
     '''Keep the AS and SH alignment inputs, give each group a colour string and
     record how many sequences each alignment contains.

     AS_msa, SH_msa: values handed to bioinf.split_fasta (presumably paths to
     MSA fasta files -- confirm against callers).'''
     self.AS_msa, self.SH_msa = AS_msa, SH_msa
     self._AS_color, self._SH_color = 'blue', 'red'

     # Element [1] of split_fasta's result is what gets counted (the sequence
     # list, judging by how split_fasta is unpacked elsewhere -- confirm).
     AS_records = bioinf.split_fasta(self.AS_msa)
     self.AS_size = len(AS_records[1])
     SH_records = bioinf.split_fasta(self.SH_msa)
     self.SH_size = len(SH_records[1])
示例#2
0
 def __init__(self, cbh_msa, egl_msa):
     '''Keep the cbh and egl alignment inputs, give each group a colour string and
     record how many sequences each alignment contains.

     cbh_msa, egl_msa: values handed to bioinf.split_fasta (presumably paths to
     MSA fasta files -- confirm against callers).'''
     self.cbh_msa, self.egl_msa = cbh_msa, egl_msa
     self._cbh_color, self._egl_color = 'blue', 'red'

     # Element [1] of split_fasta's result is what gets counted (the sequence
     # list, judging by how split_fasta is unpacked elsewhere -- confirm).
     cbh_records = bioinf.split_fasta(self.cbh_msa)
     self.cbh_size = len(cbh_records[1])
     egl_records = bioinf.split_fasta(self.egl_msa)
     self.egl_size = len(egl_records[1])
示例#3
0
def kfold_split(fasta, k, path, root='kfold'):
    '''Split the sequences in a fasta file into k folds and save every fold as a pair
    of fasta files for k-fold training and testing (${root}_train1.fasta,
    ${root}_test1.fasta, ..., numbered from 1) under the directory `path`.

    fasta: fasta file read with bioinf.split_fasta.
    k: number of folds (passed to KFold as n_splits; KFold rejects values below 2
       or above the number of sequences).
    path: directory the fold files are written to.
    root: filename prefix of the fold files.

    The shuffle is seeded (random_state=0), so repeated runs produce the same folds.'''

    heads, seqs = bioinf.split_fasta(fasta)
    # Honour the requested number of folds (this was previously fixed at 10,
    # silently ignoring k).
    kf = KFold(n_splits=k, random_state=0, shuffle=True)
    for fold, (train_index, test_index) in enumerate(kf.split(range(len(seqs))), start=1):
        # Same write for both halves of the fold; only the label and indices differ
        for label, indices in (('train', train_index), ('test', test_index)):
            bioinf.combine_fasta([heads[x] for x in indices], [seqs[x] for x in indices],
                                 f'{path}/{root}_{label}{fold}.fasta')
示例#4
0
def get_gh7looplength(msafasta, trecel7a_pos=0):
    ''' Count the residues in each of the 8 GH7 loops for every sequence of an MSA
    fasta file and return the counts as a DataFrame (one row per sequence, columns
    A1, A2, A3, A4, B1, B2, B3, B4). Loop boundaries are located in the MSA through
    TreCel7A, whose index among the fasta records is trecel7a_pos (0 if first). '''

    # TreCel7A motifs: the residues of each loop, and the residues right after it
    loop_motifs = ['QSAQK', 'TSSGVPAQVESQS', 'DYYAN', 'TNETSSTPGA',
                   'YDGNTW', 'PSSNNANT', 'GGTYSDNRYG', 'GGSS']
    flank_motifs = ['NVGARLY', 'PNAKVTFSNIK', 'MLWLDST', 'VRGSCSTSSGVPA',
                    'SSTLCPD', 'GIGGHGSCCS', 'GTCDPDGCDWNP', 'FSDKGGL']

    # Aligned and gap-free forms of the reference sequence
    _, sequences = bioinf.split_fasta(msafasta)
    ref_aligned = sequences[trecel7a_pos]
    ref_ungapped = ref_aligned.replace('-', '')

    # Label every letter column of the reference with its 0-based residue number
    # (as a string); all other columns keep their original character
    labelled = []
    residue_no = 0
    for char in ref_aligned:
        if char.isalpha():
            labelled.append(str(residue_no))
            residue_no += 1
        else:
            labelled.append(char)

    def msa_column(motif):
        # MSA column holding the first residue of `motif` in the reference
        return labelled.index(str(ref_ungapped.index(motif)))

    # A loop window runs from the start of the loop motif up to (not including)
    # the start of the motif that follows the loop
    startpos = [msa_column(motif) for motif in loop_motifs]
    stoppos = [msa_column(motif) for motif in flank_motifs]
    spans = [stop - start for start, stop in zip(startpos, stoppos)]

    # Loop length = window width minus the gap characters inside the window
    store = [
        [span - seq[start:stop].count('-')
         for start, stop, span in zip(startpos, stoppos, spans)]
        for seq in sequences
    ]

    return pd.DataFrame(store, columns=['A1', 'A2', 'A3', 'A4', 'B1', 'B2', 'B3', 'B4'])
示例#5
0
from scipy import stats

from Bio.PDB.PDBParser import PDBParser
from Bio.PDB import Selection
import matplotlib.pyplot as plt

import bioinformatics as bioinf

import warnings
warnings.filterwarnings("ignore")

# Prepare sequences and data
#=================================#
if __name__ == '__main__':
    # Reduce the structure-based MSA to the columns where TreCel7A (the first
    # sequence) has a residue, and save the trimmed alignment.
    # NOTE: the input path contains a doubled '/'; it is kept exactly as written.
    heads, sequences = bioinf.split_fasta(
        'fasta/structure_based_alignment//cel7_nr99_structaln.fasta')
    trecel7a_seq = sequences[0]
    trecel7a_positions = [col for col, char in enumerate(trecel7a_seq) if char.isalpha()]
    sequences_treonly = [
        ''.join(aligned[col] for col in trecel7a_positions)
        for aligned in sequences
    ]
    bioinf.combine_fasta(heads, sequences_treonly,
                         'fasta/trecel7a_positions_only/cel7_all.fasta')

    # Separate sequences in MSA to two sub-MSAs (CBH and EG)
    subtype = list(
示例#6
0
from keras.utils import plot_model
from keras.callbacks import ReduceLROnPlateau
import os

import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from keras.models import load_model

# Load the aligned sequences, their accessions and the subtype labels
cat_domain_fasta = 'fasta/GH13_positions_only/GH13_cat.fasta'
h, sequences = bioinf.split_fasta(cat_domain_fasta)
heads = bioinf.get_accession(cat_domain_fasta)
subtype = list(pd.read_csv('results_final/ncbi_subtypes.csv')['ncbi_pred_class'])

# Encode the labels: binarize first, then expand with to_categorical
lb = LabelBinarizer()
y = to_categorical(lb.fit_transform(subtype))

# Model/encoding settings; the padded length is the column count of the
# per-position table built from the same fasta file
sequence_df = bioinf.fasta_to_df(cat_domain_fasta)
max_length = len(sequence_df.columns)
embedding_dim = 11
top_classes = 2

# Character-level tokenization of the sequences, padded to max_length
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(sequences)
X_seq = sequence.pad_sequences(tokenizer.texts_to_sequences(sequences),
                               maxlen=max_length)
示例#7
0
# Distribution at 39 important positions
#==========================================#

# One saved site plot per entry of `positions`, all drawn at 7x4 figure size
plt.rcParams['figure.figsize'] = [7, 4]
for site in positions:
    GH13msa.site_plot(site=site, savefig=True, savepath='plots/position_distribution')

# Aromatic residues within 6Å of substrate (and consensus AS and SH)
#==============================================================================#

# Consensus sequences of the two groups, split into single characters
GH13msa.get_consensus_sequences()
AS_consensus, SH_consensus = list(GH13msa.consensus_AS), list(GH13msa.consensus_SH)

# Np is the second sequence stored in the consensus fasta file
Np = bioinf.split_fasta('fasta/GH13_positions_only/consensus.fasta')[1][1]

# Per-residue table: first data column -> closest subsite, second -> distance
excel = pd.read_csv('results_final/residue_distances.csv', index_col=0)
closest_subsite, distances = (list(excel.iloc[:, col]) for col in (0, 1))

# Accumulators for the residues that pass the aromatic/distance filter
resid_aro = []
Np_aro = []
AS_aro = []
SH_aro = []
closest_subsite_aro = []
dist_aro = []
AS_aro_freq = []
SH_aro_freq = []
conserved = []

# One-letter codes treated as aromatic
aro_res = ['F', 'W', 'Y', 'H']

# Walk over every position of Np and keep those where at least one of the three
# sequences (Np, AS consensus, SH consensus) carries a residue listed in aro_res
# and the tabulated distance for that position is at most 6.0.
# NOTE(review): the other accumulators declared earlier (closest_subsite_aro,
# dist_aro, ...) are not filled in the lines visible here -- the loop body may
# continue beyond this excerpt.
for i in range(len(Np)):
    # Residue test on the three sequences, then the distance cut-off
    if (Np[i] in aro_res or AS_consensus[i] in  aro_res or SH_consensus[i] in aro_res)\
    and distances[i]<=6.0:
        # Positions are recorded 1-based
        resid_aro.append(i + 1)
        Np_aro.append(Np[i])
        AS_aro.append(AS_consensus[i])
        SH_aro.append(SH_consensus[i])
示例#8
0
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB import Selection
import matplotlib.pyplot as plt

import bioinformatics as bioinf

import warnings

warnings.filterwarnings("ignore")

# Prepare sequences and data
#=================================#
if __name__ == '__main__':
    # Reduce the MSA to the columns where the first sequence (the GH13
    # reference) has a residue, and save the trimmed alignment
    heads, sequences = bioinf.split_fasta(
        'fasta/subtype/alignments/nrblast_all_msa_s.fasta')
    GH13_seq = sequences[0]
    GH13_positions = [col for col, char in enumerate(GH13_seq) if char.isalpha()]
    sequences_treonly = [
        ''.join(aligned[col] for col in GH13_positions)
        for aligned in sequences
    ]
    bioinf.combine_fasta(heads, sequences_treonly,
                         'fasta/GH13_positions_only/GH13_all.fasta')

    # Separate sequences in MSA to two sub-MSAs (AS and SH); AS_pos holds the
    # row indices whose predicted class equals 1
    subtype = list(
        pd.read_csv('results_final/ncbi_subtypes.csv')['ncbi_pred_class'])
    AS_pos = [row for row, label in enumerate(subtype) if label == 1]
示例#9
0
# Pymol commands for loop positions in TreCel7A and TreCel7B 
#==============================================================#

# Cel7A
# First residue number and residue count of each of the eight loops
loopstart = [98, 399, 369, 383, 51, 194, 244, 339]
length = [5, 13, 5, 10, 6, 8, 10, 4]
cel7a_start = loopstart[:]
# Last residue of a loop (inclusive) = first residue + count - 1
cel7a_stop = [first + count - 1 for first, count in zip(loopstart, length)]
# Every term ends with ' or ', so the finished string carries a trailing ' or '
cel7a_pymol = 'select cel7a_loops, ' + ''.join(
    f'resi {first}-{last} or ' for first, last in zip(cel7a_start, cel7a_stop)
)

# Cel7B
# Loop boundaries are carried over from Cel7A through the structure-based MSA:
# Cel7A residue -> MSA column -> Cel7B residue
fasta = 'fasta/structure_based_alignment/structure6_mafft.fasta'
heads, seqs = bioinf.split_fasta(fasta)
seq7a_msa = seqs[0]
seq7b_msa = seqs[3]
seq7a = seq7a_msa.replace('-', '')
seq7b = seq7b_msa.replace('-','')

# Residue numbers are shifted by -1 before the lookup (presumably because the
# helper expects 0-based indices -- confirm against bioinf.resid_to_msa)
msastart = [bioinf.resid_to_msa(seq7a_msa, resid - 1) for resid in cel7a_start]
msastop = [bioinf.resid_to_msa(seq7a_msa, resid - 1) for resid in cel7a_stop]
cel7b_start = [bioinf.msa_to_resid(seq7b_msa, column) for column in msastart]
cel7b_stop = [bioinf.msa_to_resid(seq7b_msa, column + 1) for column in msastop]

# Every term ends with ' or ', so the finished string carries a trailing ' or '
cel7b_pymol = 'select cel7b_loops, ' + ''.join(
    f'resi {first + 1}-{last} or ' for first, last in zip(cel7b_start, cel7b_stop)
)


# Write
# Each command string ends with ' or ' (4 characters); strip it and put the two
# commands in the file separated by a blank line
with open('plots/loops_pymol.txt', 'w') as pymol:
    pymol.write('\n\n'.join((cel7a_pymol[:-4], cel7b_pymol[:-4])))