def __init__(self, AS_msa, SH_msa):
    '''Store the AS and SH alignment inputs, their plot colors and their
    sizes.

    AS_msa and SH_msa are passed unchanged to bioinf.split_fasta; the size
    of each alignment is the length of the second item that call returns
    (presumably the list of sequences -- confirm against bioinf).
    '''
    self.AS_msa, self.SH_msa = AS_msa, SH_msa
    self._AS_color, self._SH_color = 'blue', 'red'

    # Number of entries in each alignment (AS is read first, then SH)
    AS_sequences = bioinf.split_fasta(self.AS_msa)[1]
    self.AS_size = len(AS_sequences)
    SH_sequences = bioinf.split_fasta(self.SH_msa)[1]
    self.SH_size = len(SH_sequences)
def __init__(self, cbh_msa, egl_msa):
    '''Store the CBH and EGL alignment inputs, their plot colors and their
    sizes.

    cbh_msa and egl_msa are passed unchanged to bioinf.split_fasta; the size
    of each alignment is the length of the second item that call returns
    (presumably the list of sequences -- confirm against bioinf).
    '''
    self.cbh_msa, self.egl_msa = cbh_msa, egl_msa
    self._cbh_color, self._egl_color = 'blue', 'red'

    # Number of entries in each alignment (CBH is read first, then EGL)
    cbh_sequences = bioinf.split_fasta(self.cbh_msa)[1]
    self.cbh_size = len(cbh_sequences)
    egl_sequences = bioinf.split_fasta(self.egl_msa)[1]
    self.egl_size = len(egl_sequences)
def kfold_split(fasta, k, path, root='kfold'):
    '''Split sequences in fasta file into k-folds and save them as k-separate
    fasta files for k-fold training and testing (saved as
    ${root}_train1.fasta, ${root}_test1.fasta, etc.)

    fasta: fasta file handed to bioinf.split_fasta.
    k:     number of folds (passed to KFold as n_splits).
    path:  directory in which the fold files are written.
    root:  filename prefix of the fold files.

    The split is shuffled with a fixed random_state of 0, so repeated calls
    on the same input produce the same folds.
    '''
    heads, seqs = bioinf.split_fasta(fasta)

    # Honor the requested number of folds (n_splits was previously fixed at
    # 10, which silently ignored k).
    kf = KFold(n_splits=k, random_state=0, shuffle=True)

    # Fold files are numbered from 1
    for fold, (train_index, test_index) in enumerate(
            kf.split(range(len(seqs))), start=1):
        for label, indices in (('train', train_index), ('test', test_index)):
            bioinf.combine_fasta([heads[x] for x in indices],
                                 [seqs[x] for x in indices],
                                 f'{path}/{root}_{label}{fold}.fasta')
def get_gh7looplength(msafasta, trecel7a_pos=0):
    '''
    Return a DataFrame of the number of residues in the 8 loops of GH7
    sequences in an MSA fasta file. TreCel7A is used as reference for
    determining the loop positions in the MSA. The position of TreCel7A in
    the fasta file is trecel7a_pos (0 if first). Loop lengths are in the
    order [A1, A2, A3, A4, B1, B2, B3, B4].

    The result has one row per sequence in the fasta file, in file order.
    Raises ValueError if one of the TreCel7A motifs below is not found in
    the reference sequence, and IndexError if trecel7a_pos is out of range.
    '''

    # Residues in the loops of TreCel7A
    loopres = ['QSAQK', 'TSSGVPAQVESQS', 'DYYAN', 'TNETSSTPGA', 'YDGNTW',
               'PSSNNANT', 'GGTYSDNRYG', 'GGSS']
    # Residues immediately after the loops (mark where each loop stops)
    loopmore = ['NVGARLY', 'PNAKVTFSNIK', 'MLWLDST', 'VRGSCSTSSGVPA',
                'SSTLCPD', 'GIGGHGSCCS', 'GTCDPDGCDWNP', 'FSDKGGL']
    loopnames = ['A1', 'A2', 'A3', 'A4', 'B1', 'B2', 'B3', 'B4']

    # Retrieve aligned sequences from the fasta file (headers are not needed)
    _, sequences = bioinf.split_fasta(msafasta)
    trecel7a_seq_msa = sequences[trecel7a_pos]

    # MSA column of every TreCel7A residue, in sequence order. The ungapped
    # sequence is built from the same columns so that an index into it is
    # always a valid index into residue_columns.
    residue_columns = [col for col, char in enumerate(trecel7a_seq_msa)
                       if char.isalpha()]
    trecel7a_nogaps = ''.join(trecel7a_seq_msa[col]
                              for col in residue_columns)

    # Loop boundaries in MSA coordinates (using TreCel7A as reference):
    # each loop starts at the first column of its motif and stops just
    # before the first column of the motif that follows it.
    startpos = [residue_columns[trecel7a_nogaps.index(motif)]
                for motif in loopres]
    stoppos = [residue_columns[trecel7a_nogaps.index(motif)]
               for motif in loopmore]

    # Loop length = number of MSA columns in the loop minus the gaps the
    # sequence has in those columns
    store = [[stop - start - seq[start:stop].count('-')
              for start, stop in zip(startpos, stoppos)]
             for seq in sequences]

    # Save results as DataFrame
    return pd.DataFrame(store, columns=loopnames)
from scipy import stats from Bio.PDB.PDBParser import PDBParser from Bio.PDB import Selection import matplotlib.pyplot as plt import bioinformatics as bioinf import warnings warnings.filterwarnings("ignore") # Prepare sequences and data #=================================# if __name__ == '__main__': # Get MSA with only TreCel7A positions for analysis heads, sequences = bioinf.split_fasta('fasta/structure_based_alignment/' \ '/cel7_nr99_structaln.fasta') trecel7a_seq = sequences[0] trecel7a_positions = [ x for x in range(len(trecel7a_seq)) if trecel7a_seq[x].isalpha() ] sequences_treonly = [] for i in range(len(sequences)): seq = list(sequences[i]) seq = [seq[x] for x in trecel7a_positions] seq = ''.join(seq) sequences_treonly.append(seq) bioinf.combine_fasta(heads, sequences_treonly, 'fasta/trecel7a_positions_only/' \ 'cel7_all.fasta') # Separate sequences in MSA to two sub-MSAs (CBH and EG) subtype = list(
from keras.utils import plot_model
from keras.callbacks import ReduceLROnPlateau
import os
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
# NOTE(review): confusion_matrix is already imported above
from sklearn.metrics import confusion_matrix
from keras.models import load_model

# load train and test dataset
# The same fasta file is read three times below (split_fasta, get_accession
# and fasta_to_df).
# NOTE(review): `h` is not used in the visible part of the script.
h, sequences = bioinf.split_fasta('fasta/GH13_positions_only/GH13_cat.fasta')
heads=bioinf.get_accession('fasta/GH13_positions_only/GH13_cat.fasta')

# Class labels come from the 'ncbi_pred_class' column of the subtypes table;
# they are binarized and then passed through to_categorical.
# NOTE(review): assumes the rows of the CSV are in the same order as the
# sequences in the fasta file -- confirm.
subtype = list(pd.read_csv('results_final/ncbi_subtypes.csv')['ncbi_pred_class'])
lb = LabelBinarizer()
y = lb.fit_transform(subtype)
y = to_categorical(y)

# Padding length = number of columns of the DataFrame built from the fasta
# file (presumably one column per alignment position -- confirm against
# bioinf.fasta_to_df).
cat_domain_fasta = 'fasta/GH13_positions_only/GH13_cat.fasta'
sequence_df = bioinf.fasta_to_df(cat_domain_fasta)
max_length = len(sequence_df.columns)

# Constants not used in the visible part of the script; presumably consumed
# by the model definition further down.
embedding_dim = 11
top_classes=2

# Encode the sequences with a tokenizer fitted on the sequences themselves
# (char_level=True: presumably one token per character), then pad the
# encoded sequences with maxlen=max_length.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(sequences)
X_seq = tokenizer.texts_to_sequences(sequences)
X_seq = sequence.pad_sequences(X_seq, maxlen=max_length)
# Distribution at 39 important positions #==========================================# plt.rcParams['figure.figsize'] = [7, 4] for i in range(len(positions)): GH13msa.site_plot(site=positions[i], savefig=True, savepath='plots/position_distribution') # Aromatic residues within 6Å of substrate (and consensus AS and SH) #==============================================================================# GH13msa.get_consensus_sequences() AS_consensus = list(GH13msa.consensus_AS) SH_consensus = list(GH13msa.consensus_SH) Np = bioinf.split_fasta('fasta/GH13_positions_only/consensus.fasta')[1][1] excel = pd.read_csv('results_final/residue_distances.csv', index_col=0) closest_subsite = list(excel.iloc[:, 0]) distances = list(excel.iloc[:, 1]) resid_aro, Np_aro, AS_aro, SH_aro, closest_subsite_aro, dist_aro = [],[],[],[],[],[] AS_aro_freq, SH_aro_freq, conserved = [], [], [] aro_res = ['F', 'W', 'Y', 'H'] for i in range(len(Np)): if (Np[i] in aro_res or AS_consensus[i] in aro_res or SH_consensus[i] in aro_res)\ and distances[i]<=6.0: resid_aro.append(i + 1) Np_aro.append(Np[i]) AS_aro.append(AS_consensus[i]) SH_aro.append(SH_consensus[i])
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB import Selection
import matplotlib.pyplot as plt
import bioinformatics as bioinf
import warnings
warnings.filterwarnings("ignore")


# Prepare sequences and data
#=================================#
if __name__ == '__main__':

    # Reduce the MSA to the columns occupied by a residue in the first
    # sequence (the GH13 reference) and save the reduced alignment
    heads, sequences = bioinf.split_fasta(
        'fasta/subtype/alignments/nrblast_all_msa_s.fasta')
    GH13_seq = sequences[0]
    GH13_positions = [col for col, char in enumerate(GH13_seq)
                      if char.isalpha()]
    sequences_treonly = []
    for i in range(len(sequences)):
        # Keep only the reference columns of this aligned sequence
        seq = ''.join(sequences[i][x] for x in GH13_positions)
        sequences_treonly.append(seq)
    bioinf.combine_fasta(heads, sequences_treonly,
                         'fasta/GH13_positions_only/'
                         'GH13_all.fasta')

    # Separate sequences in MSA to two sub-MSAs (AS and SH):
    # AS_pos holds the indices of the entries whose predicted class is 1
    subtype = list(
        pd.read_csv('results_final/ncbi_subtypes.csv')['ncbi_pred_class'])
    AS_pos = [idx for idx, label in enumerate(subtype) if label == 1]
# Pymol commands for loop positions in TreCel7A and TreCel7B
#==============================================================#

# Cel7A: first residue and number of residues of each of the 8 loops;
# the last residue of a loop is start + length - 1
loopstart = [98, 399, 369, 383, 51, 194, 244, 339]
length = [5, 13, 5, 10, 6, 8, 10, 4]
cel7a_start = list(loopstart)
cel7a_stop = [first + size - 1 for first, size in zip(loopstart, length)]
cel7a_pymol = 'select cel7a_loops, ' + ''.join(
    f'resi {first}-{last} or '
    for first, last in zip(cel7a_start, cel7a_stop)
)

# Cel7B: carry the Cel7A loop boundaries over to Cel7B through the
# structure-based alignment (Cel7A is entry 0 and Cel7B is entry 3)
fasta = 'fasta/structure_based_alignment/structure6_mafft.fasta'
heads, seqs = bioinf.split_fasta(fasta)
seq7a_msa = seqs[0]
seq7b_msa = seqs[3]
seq7a = seq7a_msa.replace('-', '')
seq7b = seq7b_msa.replace('-', '')
# resid - 1: presumably converts residue numbers to the 0-based indices
# expected by bioinf.resid_to_msa -- confirm against bioinf
msastart = [bioinf.resid_to_msa(seq7a_msa, resid - 1) for resid in cel7a_start]
msastop = [bioinf.resid_to_msa(seq7a_msa, resid - 1) for resid in cel7a_stop]
cel7b_start = [bioinf.msa_to_resid(seq7b_msa, col) for col in msastart]
cel7b_stop = [bioinf.msa_to_resid(seq7b_msa, col + 1) for col in msastop]
cel7b_pymol = 'select cel7b_loops, ' + ''.join(
    f'resi {first + 1}-{last} or '
    for first, last in zip(cel7b_start, cel7b_stop)
)

# Write both selections, dropping the trailing ' or ' from each
with open('plots/loops_pymol.txt', 'w') as pymol:
    pymol.write(f'{cel7a_pymol[:-4]}\n\n{cel7b_pymol[:-4]}')