def KNN_classification(dataset, filename):
    """
    Fit a k-nearest neighbors classifier, plot its ROC and PR curves on
    the test split, and report classification statistics.

    Parameters
    ---
    dataset: the input dataset; it is passed to prepare_data, which
        yields the training/test features and the corresponding labels.
    filename: an identifier inserted into the plot titles and output
        file names to distinguish different plots from each other.

    Returns
    ---
    stats: result of calc_stat on the true and predicted test labels
        (expected to hold classification accuracy, precision and recall).
    """
    # Features and labels for both splits come from prepare_data
    train_features, test_features, train_labels, test_labels = \
        prepare_data(dataset)

    # 100 neighbors; Minkowski metric with p=2
    model = KNeighborsClassifier(n_neighbors=100, metric='minkowski', p=2)
    model.fit(train_features, train_labels)

    # Hard label predictions plus the score of the second class column,
    # which is what both curves are drawn from
    predicted_labels = model.predict(test_features)
    positive_scores = model.predict_proba(test_features)[:, 1]

    # ROC curve
    roc_title = 'KNN ROC curve (Train={})'.format(filename)
    roc_path = 'figures/KNN_ROC_Test_{}.png'.format(filename)
    plot_ROC_curve(test_labels, positive_scores,
                   plot_title=roc_title, plot_dir=roc_path)

    # Precision-recall curve
    pr_title = 'KNN Precision-Recall curve (Train={})'.format(filename)
    pr_path = 'figures/KNN_P-R_Test_{}.png'.format(filename)
    plot_PR_curve(test_labels, positive_scores,
                  plot_title=pr_title, plot_dir=pr_path)

    # Statistics are computed from the hard label predictions
    return calc_stat(test_labels, predicted_labels)
def LogReg_classification(dataset, filename):
    """
    Fit a logistic regression classifier, plot its ROC and PR curves on
    the test split, and report classification statistics.

    Parameters
    ---
    dataset: the input dataset; it is passed to prepare_data, which
        yields the training/test features and the corresponding labels.
    filename: an identifier inserted into the plot titles and output
        file names to distinguish different plots from each other.

    Returns
    ---
    stats: result of calc_stat on the true and predicted test labels
        (expected to hold classification accuracy, precision and recall).
    """
    # Features and labels for both splits come from prepare_data
    train_features, test_features, train_labels, test_labels = \
        prepare_data(dataset)

    # Fixed random_state keeps the fit reproducible between runs
    model = LogisticRegression(random_state=0)
    model.fit(train_features, train_labels)

    # Hard label predictions plus the probability of the second class
    predicted_labels = model.predict(test_features)
    positive_scores = model.predict_proba(test_features)[:, 1]

    # ROC curve
    roc_title = 'Logistic Regression ROC curve (Train={})'.format(filename)
    roc_path = 'figures/LR_ROC_Test_{}.png'.format(filename)
    plot_ROC_curve(test_labels, positive_scores,
                   plot_title=roc_title, plot_dir=roc_path)

    # Precision-recall curve
    pr_title = 'Logistic Regression Precision-Recall curve (Train={})'.format(
        filename)
    pr_path = 'figures/LR_P-R_Test_{}.png'.format(filename)
    plot_PR_curve(test_labels, positive_scores,
                  plot_title=pr_title, plot_dir=pr_path)

    # Statistics are computed from the hard label predictions
    return calc_stat(test_labels, predicted_labels)
def SVM_classification(dataset, filename):
    """
    Fit a support vector classifier, plot its ROC and PR curves on the
    test split, and report classification statistics.

    Parameters
    ---
    dataset: the input dataset; it is passed to prepare_data, which
        yields the training/test features and the corresponding labels.
    filename: an identifier inserted into the plot titles and output
        file names to distinguish different plots from each other.

    Returns
    ---
    stats: result of calc_stat on the true and predicted test labels
        (expected to hold classification accuracy, precision and recall).
    """
    # Features and labels for both splits come from prepare_data
    train_features, test_features, train_labels, test_labels = \
        prepare_data(dataset)

    # RBF-kernel support vector classifier
    model = SVC(kernel='rbf')
    model.fit(train_features, train_labels)

    # The curves are drawn from the decision function values rather
    # than from class probabilities
    predicted_labels = model.predict(test_features)
    decision_values = model.decision_function(test_features)

    # ROC curve
    roc_title = 'SVM ROC curve (Train={})'.format(filename)
    roc_path = 'figures/SVM_ROC_Test_{}.png'.format(filename)
    plot_ROC_curve(test_labels, decision_values,
                   plot_title=roc_title, plot_dir=roc_path)

    # Precision-recall curve
    pr_title = 'SVM Precision-Recall curve (Train={})'.format(filename)
    pr_path = 'figures/SVM_P-R_Test_{}.png'.format(filename)
    plot_PR_curve(test_labels, decision_values,
                  plot_title=pr_title, plot_dir=pr_path)

    # Statistics are computed from the hard label predictions
    return calc_stat(test_labels, predicted_labels)
def _pairwise_one_hot_features(encoded):
    """
    Build the flattened "2D" feature vector of one encoded sequence.

    For every column except the last, and for every row, the entry at
    (row, column) scales the block of all later columns; the scaled
    blocks are joined side by side and flattened in column-major order.

    Parameters
    ---
    encoded: 2D array holding one encoded sequence.
        NOTE(review): presumably rows index alphabet symbols and columns
        index sequence positions - confirm against one_hot_encoder.

    Returns
    ---
    1D array of the flattened pairwise features.
    """
    n_rows = encoded.shape[0]
    n_cols = encoded.shape[1]

    # Start from a zero-width float block so that sequences with fewer
    # than two columns still yield an (empty) feature vector, and so the
    # result dtype is promoted exactly as when growing an np.empty array.
    blocks = [np.empty([n_rows, 0])]
    for col in range(n_cols - 1):
        later_columns = encoded[:, col + 1:]
        for row in range(n_rows):
            blocks.append(encoded[row, col] * later_columns)

    # Concatenate once instead of re-copying the growing matrix on every
    # inner iteration.
    return np.concatenate(blocks, axis=1).flatten('F')


def _encode_sequences_2d(sequences):
    """One hot encode each sequence and expand it to its 2D features."""
    return [
        _pairwise_one_hot_features(
            one_hot_encoder(s=seq, alphabet=IUPAC.protein))
        for seq in sequences
    ]


def LogReg2D_classification(dataset, filename):
    """
    Classification of data with 2D logistic regression, followed by
    plotting of ROC and PR curves.

    Each sequence is one hot encoded and expanded into pairwise features
    before a logistic regression classifier is fitted to the training
    split and evaluated on the test split.

    Parameters
    ---
    dataset: the input dataset; its train and test attributes are
        indexed with .loc for the 'AASeq' (sequence) and 'AgClass'
        (label) columns.
    filename: an identifier inserted into the plot titles and output
        file names to distinguish different plots from each other.

    Returns
    ---
    stats: result of calc_stat on the true and predicted test labels
        (expected to hold classification accuracy, precision and recall).
    """
    # Import training/test sequences
    train_sequences = dataset.train.loc[:, 'AASeq'].values
    test_sequences = dataset.test.loc[:, 'AASeq'].values

    # One hot encode the sequences in 2D (same transformation for both
    # splits, so the feature layout is guaranteed to match)
    X_train = _encode_sequences_2d(train_sequences)
    X_test = _encode_sequences_2d(test_sequences)

    # Extract labels of training/test set
    y_train = dataset.train.loc[:, 'AgClass'].values
    y_test = dataset.test.loc[:, 'AgClass'].values

    # Fitting Logistic Regression to the training set
    LR_classifier = LogisticRegression(random_state=0)
    LR_classifier.fit(X_train, y_train)

    # Predicting the test set results
    y_pred = LR_classifier.predict(X_test)
    y_score = LR_classifier.predict_proba(X_test)

    # ROC curve
    title = '2D Logistic Regression ROC curve (Train={})'.format(filename)
    plot_ROC_curve(y_test, y_score[:, 1], plot_title=title,
                   plot_dir='figures/2DLR_ROC_Test_{}.png'.format(filename))

    # Precision-recall curve
    title = '2D Logistic Regression Precision-Recall curve (Train={})'.format(
        filename)
    plot_PR_curve(y_test, y_score[:, 1], plot_title=title,
                  plot_dir='figures/2DLR_P-R_Test_{}.png'.format(filename))

    # Calculate and return statistics
    return calc_stat(y_test, y_pred)
def RNN_classification(dataset, filename, save_model=False):
    """
    Train a recurrent neural network on the training split (validated on
    the validation split), then plot ROC and PR curves for the test split.

    Parameters
    ---
    dataset: the input dataset; its train, test and val attributes are
        indexed with .loc for the 'AASeq' (sequence) and 'AgClass'
        (label) columns.
    filename: an identifier inserted into the plot titles and output
        file names to distinguish different plots from each other.
    save_model: optional; if provided, should specify the directory to
        save the model summary and the model itself.

    Returns
    ---
    The fitted classification model if save_model is truthy; otherwise
    the result of calc_stat on the test labels and the predictions
    thresholded at 0.5 (expected to hold classification accuracy,
    precision and recall).
    """
    def _to_network_input(sequences):
        # One hot encode every sequence, stack them, and swap the last
        # two axes of the stacked array
        encoded = [one_hot_encoder(s=seq, alphabet=IUPAC.protein)
                   for seq in sequences]
        return np.transpose(np.asarray(encoded), (0, 2, 1))

    # Raw sequences of the three splits
    train_sequences = dataset.train.loc[:, 'AASeq'].values
    test_sequences = dataset.test.loc[:, 'AASeq'].values
    val_sequences = dataset.val.loc[:, 'AASeq'].values

    # Encoded network inputs
    train_inputs = _to_network_input(train_sequences)
    test_inputs = _to_network_input(test_sequences)
    val_inputs = _to_network_input(val_sequences)

    # Labels of the three splits
    train_labels = dataset.train.loc[:, 'AgClass'].values
    test_labels = dataset.test.loc[:, 'AgClass'].values
    val_labels = dataset.val.loc[:, 'AgClass'].values

    # Build and compile the network
    network = create_rnn()
    network.compile(optimizer='rmsprop',
                    loss='binary_crossentropy',
                    metrics=['accuracy'])

    # Train; the returned history object is not needed
    network.fit(x=train_inputs, y=train_labels, shuffle=True,
                validation_data=(val_inputs, val_labels),
                epochs=20, batch_size=32, verbose=2)

    # Network outputs for the test split
    test_scores = network.predict(x=test_inputs)

    # ROC curve
    roc_title = 'RNN ROC curve (Train={})'.format(filename)
    roc_path = 'figures/RNN_ROC_Test_{}.png'.format(filename)
    plot_ROC_curve(test_labels, test_scores,
                   plot_title=roc_title, plot_dir=roc_path)

    # Precision-recall curve
    pr_title = 'RNN Precision-Recall curve (Train={})'.format(filename)
    pr_path = 'figures/RNN_P-R_Test_{}.png'.format(filename)
    plot_PR_curve(test_labels, test_scores,
                  plot_title=pr_title, plot_dir=pr_path)

    if not save_model:
        # Outputs larger than 0.5 count as positive predictions
        return calc_stat(test_labels, test_scores > 0.5)

    # Write the model summary by capturing what summary() prints
    summary_path = os.path.join(save_model, 'RNN_summary.txt')
    with open(summary_path, 'w') as summary_file, \
            redirect_stdout(summary_file):
        network.summary()

    # Persist the model next to its summary and hand it back
    network.save(os.path.join(save_model, 'RNN_HER2'))
    return network
def CNN_classification(dataset, filename, save_model=False, params=None):
    """
    Train a convolutional neural network on the training split (validated
    on the validation split), then plot ROC and PR curves for the test
    split.

    Parameters
    ---
    dataset: the input dataset; its train, test and val attributes are
        indexed with .loc for the 'AASeq' (sequence) and 'AgClass'
        (label) columns.
    filename: an identifier inserted into the plot titles and output
        file names to distinguish different plots from each other.
    save_model: optional; if provided, should specify the directory to
        save the model summary and the model itself.
    params: optional; layer specification passed to create_cnn, e.g. the
        optimized parameters from a separate model tuning step. If
        falsy (None or empty), a hard-coded specification is used.

    Returns
    ---
    The fitted classification model if save_model is truthy; otherwise
    the result of calc_stat on the test labels and the predictions
    thresholded at 0.5 (expected to hold classification accuracy,
    precision and recall).
    """
    def _to_network_input(sequences):
        # One hot encode every sequence, stack them, and swap the last
        # two axes of the stacked array
        encoded = [one_hot_encoder(s=seq, alphabet=IUPAC.protein)
                   for seq in sequences]
        return np.transpose(np.asarray(encoded), (0, 2, 1))

    # Raw sequences of the three splits
    train_sequences = dataset.train.loc[:, 'AASeq'].values
    test_sequences = dataset.test.loc[:, 'AASeq'].values
    val_sequences = dataset.val.loc[:, 'AASeq'].values

    # Encoded network inputs
    train_inputs = _to_network_input(train_sequences)
    test_inputs = _to_network_input(test_sequences)
    val_inputs = _to_network_input(val_sequences)

    # Labels of the three splits
    train_labels = dataset.train.loc[:, 'AgClass'].values
    test_labels = dataset.test.loc[:, 'AgClass'].values
    val_labels = dataset.val.loc[:, 'AgClass'].values

    # Fall back to the hard-coded layer specification
    if not params:
        params = [
            ['CONV', 400, 3, 1],
            ['DROP', 0.5],
            ['POOL', 2, 1],
            ['FLAT'],
            ['DENSE', 50],
        ]

    # Build the network.
    # NOTE(review): (10, 20) is presumably the per-sequence input shape
    # expected by create_cnn - confirm against its definition.
    network = create_cnn(params, (10, 20), 'relu', None)

    # Compile with Adam at a fixed learning rate
    optimizer = Adam(learning_rate=0.000075)
    network.compile(optimizer=optimizer,
                    loss='binary_crossentropy',
                    metrics=['accuracy'])

    # Train; the returned history object is not needed
    network.fit(x=train_inputs, y=train_labels, shuffle=True,
                validation_data=(val_inputs, val_labels),
                epochs=20, batch_size=16, verbose=2)

    # Network outputs for the test split
    test_scores = network.predict(x=test_inputs)

    # ROC curve
    roc_title = 'CNN ROC curve (Train={})'.format(filename)
    roc_path = 'figures/CNN_ROC_Test_{}.png'.format(filename)
    plot_ROC_curve(test_labels, test_scores,
                   plot_title=roc_title, plot_dir=roc_path)

    # Precision-recall curve
    pr_title = 'CNN Precision-Recall curve (Train={})'.format(filename)
    pr_path = 'figures/CNN_P-R_Test_{}.png'.format(filename)
    plot_PR_curve(test_labels, test_scores,
                  plot_title=pr_title, plot_dir=pr_path)

    if not save_model:
        # Outputs larger than 0.5 count as positive predictions
        return calc_stat(test_labels, test_scores > 0.5)

    # Write the model summary by capturing what summary() prints
    summary_path = os.path.join(save_model, 'CNN_summary.txt')
    with open(summary_path, 'w') as summary_file, \
            redirect_stdout(summary_file):
        network.summary()

    # Persist the model next to its summary and hand it back
    network.save(os.path.join(save_model, 'CNN_HER2'))
    return network