def main():
    """Load the saved best model and write its test-set predictions to disk.

    Parameter setting ids containing "deepsol1" feed the model only the
    padded sequences; every other id additionally feeds the bio-feature
    matrix.  The training split is processed only to fill ``dynamic_args``
    ('num_classes', plus 'num_bio_feats' for the bio variants) and to print
    its shape -- no training happens here.
    """
    data = load_data()
    sequence_only = "deepsol1" in static_args.parameter_setting_id

    # ---- training split -------------------------------------------------
    train_seqs = data['train']['src']
    if not sequence_only:
        train_bio = data['train']['src_bio']
    train_labels = data['train']['tgt']

    dynamic_args['num_classes'] = len(set(train_labels))
    train_seqs = np.array(utils.pad_sequecnes(train_seqs, static_args.maxlen))
    if not sequence_only:
        # The last column of every bio-feature row is dropped.
        train_bio = np.array(train_bio)[:, :-1]
        dynamic_args['num_bio_feats'] = int(train_bio.shape[1])
    # The encoded training labels are not used further in this function.
    train_labels = utils.get_one_hot(train_labels,
                                     dynamic_args['num_classes'])
    print('Training data: ', train_seqs.shape)
    if not sequence_only:
        print('Training data Bio: ', train_bio.shape)
        print(train_bio[1])

    # ---- test split -----------------------------------------------------
    test_seqs = data['test']['src']
    if not sequence_only:
        test_bio = data['test']['src_bio']
    test_labels = data['test']['tgt']

    test_seqs = np.array(utils.pad_sequecnes(test_seqs, static_args.maxlen))
    if not sequence_only:
        test_bio = np.array(test_bio)[:, :-1]
    test_labels = utils.get_one_hot(test_labels, dynamic_args['num_classes'])
    print('Test data: ', test_seqs.shape)
    if not sequence_only:
        print('Test data Bio: ', test_bio.shape)

    # ---- prediction -----------------------------------------------------
    best_model = utils.load_model(get_model_path())
    if sequence_only:
        model_inputs = test_seqs
    else:
        model_inputs = [test_seqs, test_bio]
    pred_test, pred_prob_test = get_classification_prediction(
        best_model, model_inputs, test_labels)
    print('Finished testing')

    # Save on disk.  The path helper's return value is discarded, exactly as
    # before; the call is kept in case the helper has side effects
    # (NOTE(review): confirm whether it is needed at all).
    get_classification_performance_path()
    save_classification_prediction(pred_test, pred_prob_test)
def main():
    """Train the sequence-only model and record validation/test performance.

    Steps:
      1. Load the data and derive ``dynamic_args['num_classes']`` from the
         distinct training labels.
      2. Pad the sequences and one-hot encode the labels of the train,
         validation and test splits.
      3. Build, compile and fit the model, validating on the validation
         split.
      4. Reload the model stored at ``get_model_path()`` (the original
         comment describes it as the model that performs best on the
         validation data), evaluate it on validation and test data, and
         write the results to a fresh results file.
    """
    data = load_data()

    # The class count must be known before any split can be one-hot encoded.
    dynamic_args['num_classes'] = len(set(data['train']['tgt']))

    # All three splits get the same treatment: pad sequences, encode labels.
    prepared = {}
    for split_name, shape_label in (('train', 'Training data: '),
                                    ('valid', 'Validation data: '),
                                    ('test', 'Test data: ')):
        split = data[split_name]
        x_split = np.array(
            utils.pad_sequecnes(split['src'], static_args.maxlen))
        y_split = utils.get_one_hot(split['tgt'], dynamic_args['num_classes'])
        print(shape_label, x_split.shape)
        prepared[split_name] = (x_split, y_split)

    x_train, y_train = prepared['train']
    x_val, y_val = prepared['valid']
    x_test, y_test = prepared['test']

    model = Models.DeepSol(static_args, dynamic_args).fetch_model_def()
    model.compile(loss='binary_crossentropy',
                  optimizer=get_optimizer(),
                  metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()

    # Training
    model.fit(x_train,
              y_train,
              batch_size=dynamic_args['batch_size'],
              epochs=int(static_args.epochs),
              validation_data=(x_val, y_val),
              callbacks=get_callbacks())

    # Load the best model (the one that performs best on the validation
    # data, per the original comment).
    best_model = utils.load_model(get_model_path())

    # Classification performance of the best model on validation and test.
    # Only the validation accuracy is reported, so the rest is discarded.
    acc_val, _, _, _ = get_classification_performance(best_model, x_val,
                                                      y_val)
    acc_test, score_report_test, cm_test, pred_test = \
        get_classification_performance(best_model, x_test, y_test)

    # Remove results left over from an earlier run before saving new ones.
    results_filename_with_path = get_classification_performance_path()
    if os.path.exists(results_filename_with_path):
        os.remove(results_filename_with_path)

    # Save on disk.
    save_classification_performance('Validation Accuracy: ', str(acc_val))
    save_classification_performance('Test Accuracy: ', str(acc_test))
    save_classification_performance('Score Report Test: : ',
                                    str(score_report_test))
    save_classification_performance('Confusion Matrix test: ', str(cm_test))
    save_classification_prediction(pred_test)
def main():
    """Train the model on sequence and/or bio features and record results.

    Which inputs are fed to the model depends on ``dynamic_args``:
      * 'biofeats' and 'protein_seq_feats' both set -> [sequences, bio]
      * only 'protein_seq_feats' set                -> sequences
      * only 'biofeats' set                         -> bio features

    After fitting, the model stored at ``get_model_path()`` is reloaded and
    evaluated on the validation and test splits; the results are written to
    a fresh results file.

    Changes compared with the previous version:
      * The balanced class weights were computed but never used, while
        ``class_weight="auto"`` (not a supported value for ``fit``) was
        passed instead.  The computed weights are now passed as a dict.
      * If neither feature family is enabled, a ValueError is raised before
        the old results file is removed, instead of failing later with a
        NameError on the undefined test metrics.

    Raises:
        ValueError: if both ``dynamic_args['biofeats']`` and
            ``dynamic_args['protein_seq_feats']`` are None.
    """
    data = load_data()

    train_labels = data['train']['tgt']
    dynamic_args['num_classes'] = len(set(train_labels))

    # Keyword arguments work with both old and new scikit-learn signatures.
    balanced = class_weight.compute_class_weight(
        'balanced', classes=np.unique(train_labels), y=train_labels)
    # Keras expects {class index: weight}.
    # NOTE(review): assumes the sorted unique labels line up with the
    # one-hot column order produced by utils.get_one_hot (e.g. labels are
    # 0..num_classes-1) -- confirm.
    class_weights = {index: float(weight)
                     for index, weight in enumerate(balanced)}

    def prepare(split_name, seq_label, bio_label):
        """Return (padded sequences, bio features, one-hot labels)."""
        split = data[split_name]
        x_seq = np.array(
            utils.pad_sequecnes(split['src'], static_args.maxlen))
        # The last column of every bio-feature row is dropped.
        x_bio = np.array(split['src_bio'])[:, :-1]
        y_split = utils.get_one_hot(split['tgt'],
                                    dynamic_args['num_classes'])
        print(seq_label, x_seq.shape)
        print(bio_label, x_bio.shape)
        return x_seq, x_bio, y_split

    x_train, x_train_bio, y_train = prepare(
        'train', 'Training data: ', 'Training data Bio: ')
    dynamic_args['num_bio_feats'] = int(x_train_bio.shape[1])
    print(x_train_bio[1])
    x_val, x_val_bio, y_val = prepare(
        'valid', 'Validation data: ', 'Validation data Bio: ')
    x_test, x_test_bio, y_test = prepare(
        'test', 'Test data: ', 'Test data Bio: ')

    model = Models.DeepSol(static_args, dynamic_args).fetch_model_def()
    model.compile(loss='binary_crossentropy',
                  optimizer=get_optimizer(),
                  metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()

    # Either use both bio and protein feats or just one of them.  The flags
    # are read here, after model construction, as in the original flow.
    use_bio = dynamic_args['biofeats'] is not None
    use_seq = dynamic_args['protein_seq_feats'] is not None
    if use_bio and use_seq:
        train_inputs = [x_train, x_train_bio]
        val_inputs = [x_val, x_val_bio]
        test_inputs = [x_test, x_test_bio]
    elif use_seq:
        train_inputs, val_inputs, test_inputs = x_train, x_val, x_test
    elif use_bio:
        train_inputs, val_inputs, test_inputs = (x_train_bio, x_val_bio,
                                                 x_test_bio)
    else:
        raise ValueError(
            "Nothing to train on: both dynamic_args['biofeats'] and "
            "dynamic_args['protein_seq_feats'] are None")

    # Training
    model.fit(train_inputs,
              y_train,
              batch_size=dynamic_args['batch_size'],
              class_weight=class_weights,
              epochs=int(static_args.epochs),
              validation_data=(val_inputs, y_val),
              callbacks=get_callbacks())

    best_model = utils.load_model(get_model_path())
    # Only the validation accuracy is reported, so the rest is discarded.
    acc_val, _, _, _ = get_classification_performance(best_model, val_inputs,
                                                      y_val)
    acc_test, score_report_test, cm_test, pred_test = \
        get_classification_performance(best_model, test_inputs, y_test)

    # Remove results left over from an earlier run before saving new ones.
    results_filename_with_path = get_classification_performance_path()
    if os.path.exists(results_filename_with_path):
        os.remove(results_filename_with_path)

    # Save on disk.
    save_classification_performance('Validation Accuracy: ', str(acc_val))
    save_classification_performance('Test Accuracy: ', str(acc_test))
    save_classification_performance('Score Report Test: : ',
                                    str(score_report_test))
    save_classification_performance('Confusion Matrix test: ', str(cm_test))
    save_classification_prediction(pred_test)
def main():
    """Evaluate the saved best model on 10 folds of train + validation data.

    The training and validation splits are preprocessed and concatenated
    into one "full" set.  The model stored at ``get_model_path()`` is then
    scored on the held-out part of each of 10 unshuffled folds; the model is
    not retrained, so only each fold's test indices are used.  Per-fold
    metrics and the mean accuracy / MCC over all folds are appended to a
    fresh results file.

    The "deepsol1" setting feeds only the padded sequences to the model;
    every other setting additionally feeds the bio-feature matrix.
    """
    data = load_data()

    # To perform 10-fold cross-validation
    kf = KFold(n_splits=10, shuffle=False)
    sequence_only = static_args.parameter_setting_id == "deepsol1"

    # ---- training split -------------------------------------------------
    train_seqs = data['train']['src']
    if not sequence_only:
        train_bio = data['train']['src_bio']
    train_labels = data['train']['tgt']

    dynamic_args['num_classes'] = len(set(train_labels))
    train_seqs = np.array(utils.pad_sequecnes(train_seqs, static_args.maxlen))
    if not sequence_only:
        # The last column of every bio-feature row is dropped.
        train_bio = np.array(train_bio)[:, :-1]
        dynamic_args['num_bio_feats'] = int(train_bio.shape[1])
    train_labels = utils.get_one_hot(train_labels,
                                     dynamic_args['num_classes'])
    print('Training data: ', train_seqs.shape)
    if not sequence_only:
        print('Training data Bio: ', train_bio.shape)

    # ---- validation split -----------------------------------------------
    valid_seqs = data['valid']['src']
    if not sequence_only:
        valid_bio = data['valid']['src_bio']
    valid_labels = data['valid']['tgt']

    valid_seqs = np.array(utils.pad_sequecnes(valid_seqs, static_args.maxlen))
    if not sequence_only:
        valid_bio = np.array(valid_bio)[:, :-1]
    valid_labels = utils.get_one_hot(valid_labels,
                                     dynamic_args['num_classes'])
    print('Valid data: ', valid_seqs.shape)
    if not sequence_only:
        print('Valid data Bio: ', valid_bio.shape)

    # ---- train + validation stacked along the sample axis ---------------
    x_full = np.concatenate((train_seqs, valid_seqs), axis=0)
    if not sequence_only:
        x_full_bio = np.concatenate((train_bio, valid_bio), axis=0)
    y_full = np.concatenate((train_labels, valid_labels), axis=0)
    print('Full Train data: ', x_full.shape)
    if not sequence_only:
        print('Full Train Bio: ', x_full_bio.shape)

    best_model = utils.load_model(get_model_path())
    print('Loaded best model for ', static_args.parameter_setting_id)

    # Get file where results are to be saved; start from a clean file.
    results_filename_with_path = get_classification_performance_path()
    if os.path.exists(results_filename_with_path):
        os.remove(results_filename_with_path)

    # Keep per-fold scores so the averages can be reported at the end.
    fold_accuracies = []
    fold_mccs = []
    for fold_no, (_, held_out) in enumerate(kf.split(x_full), 1):
        print('Starting CV Iteration: ', str(fold_no))
        x_test = x_full[held_out]
        y_test = np.array(y_full[held_out], dtype='int32')

        if sequence_only:
            fold_inputs = x_test
        else:
            fold_inputs = [x_test, x_full_bio[held_out]]
        (acc_test, score_report_test, cm_test, mcc_test, pred_test,
         pred_prob_test) = get_classification_performance(
             best_model, fold_inputs, y_test)

        # Save output on disk
        save_classification_performance('Iteration: ' + str(fold_no), '')
        save_classification_performance('Test Accuracy: ', str(acc_test))
        save_classification_performance('Test MCC: ', str(mcc_test))
        save_classification_performance('Score Report Test: : ',
                                        str(score_report_test))
        save_classification_performance('Confusion Matrix test: ',
                                        str(cm_test))
        fold_accuracies.append(acc_test)
        fold_mccs.append(mcc_test)

    mean_acc = (1.0 * sum(fold_accuracies)) / len(fold_accuracies)
    save_classification_performance('Mean CV accuracy: ', str(mean_acc))
    mean_mcc = (1.0 * sum(fold_mccs)) / len(fold_mccs)
    save_classification_performance('Mean CV MCC: ', str(mean_mcc))
    print('Finished cross-validation')