def drug():
    """Flask-style endpoint: predict binding affinity for one drug-target pair.

    Expects a JSON request body with keys "molecule" (SMILES string) and
    "target" (protein sequence). Returns the predicted score as a string.
    """
    # FIX: parse the request body once instead of calling request.get_json()
    # twice (each call re-reads/parses the body).
    payload = request.get_json()
    X_drug = [payload["molecule"]]
    X_target = [payload["target"]]
    drug_encoding, target_encoding = 'Morgan', 'AAC'
    # Pretrained DeepPurpose model matching the Morgan/AAC encodings.
    net = models.model_pretrained(model='Morgan_AAC_DAVIS')
    # Dummy label: data_process requires a y vector even for pure inference.
    y = [0]
    X_pred = utils.data_process(X_drug, X_target, y,
                                drug_encoding, target_encoding,
                                split_method='no_split')
    y_pred = net.predict(X_pred)
    print('The predicted score is ' + str(y_pred))
    return str(y_pred)
# --- Training script: NPASS affinity data, CNN protein encoder ---
target_encoding = 'CNN'

# Random 85/10/5 train/val/test split of the paired data.
train, val, test = data_process(
    X_drugs, X_targets, y,
    drug_encoding, target_encoding,
    split_method='random',
    frac=[0.85, 0.1, 0.05],
)

# Model hyper-parameters; training artifacts are written to the Drive folder.
config = generate_config(
    drug_encoding=drug_encoding,
    target_encoding=target_encoding,
    cls_hidden_dims=[1024, 1024, 512],
    train_epoch=100,
    LR=0.001,
    batch_size=32,
    hidden_dim_drug=128,
    mpnn_hidden_size=128,
    mpnn_depth=3,
    cnn_target_filters=[32, 64, 96],
    cnn_target_kernels=[4, 8, 12],
    result_folder="/content/drive/MyDrive/Colab Notebooks/DeepPurpose_results/NPASS_MPNN_2",
)
model = models.model_initialize(**config)

# Report elapsed time since t1 was captured earlier in the script.
t2 = time()
print("cost about " + str(int(t2 - t1)) + " seconds")

model.train(train, val, test)
model.save_model('/content/drive/MyDrive/Colab Notebooks/models/NPASS_MPNN_2')
def virtual_screening(target,
                      X_repurpose=None,
                      target_name=None,
                      drug_names=None,
                      train_drug=None,
                      train_target=None,
                      train_y=None,
                      save_dir='./save_folder',
                      pretrained_dir=None,
                      finetune_epochs=10,
                      finetune_LR=0.01,
                      finetune_batch_size=32,
                      convert_y=True,
                      subsample_frac=1,
                      pretrained=True,
                      split='random',
                      frac=None,
                      agg='agg_mean_max',
                      output_len=30):
    """Virtually screen drug candidates against a list of protein targets.

    Either runs the bundled pretrained model(s) directly, or fine-tunes /
    trains them on user-supplied data first, then aggregates per-model
    predictions, writes a ranked table to ``virtual_screening.txt`` and
    pickles intermediate results under ``save_dir``.

    Parameters
    ----------
    target : list of protein sequences to screen against.
    X_repurpose : list of candidate SMILES strings (required).
    target_name / drug_names : display names; auto-generated when None.
    train_drug, train_target, train_y : optional custom training data.
    save_dir : output directory (created if missing).
    pretrained_dir : path to downloaded models; downloaded when None.
    finetune_epochs, finetune_LR, finetune_batch_size : fine-tuning config.
    convert_y : convert model outputs from p-scale to nM (lower = stronger).
    subsample_frac, split, frac : data_process options for custom training.
    agg : 'mean', 'max_effect', or 'agg_mean_max' aggregation strategy.
    output_len : number of table rows echoed to stdout.

    Raises
    ------
    ValueError : when no drug candidate list is provided.
    """
    # FIX: avoid a shared mutable default argument for the split fractions.
    if frac is None:
        frac = [0.7, 0.1, 0.2]

    if not os.path.exists(save_dir):
        print(
            'Save path not found or given and set to default: \'./save_folder/\'. '
        )
        os.mkdir(save_dir)

    if target_name is None:
        # FIX: placeholder names must match the number of targets; the
        # original sized this list by len(X_repurpose) (the drug list).
        target_name = ['Target ' + str(i) for i in range(len(target))]

    if X_repurpose is not None:
        if drug_names is None:
            drug_names = ['Drug ' + str(i) for i in range(len(X_repurpose))]
        print("Loading customized repurposing dataset...")
    else:
        # FIX: fail fast instead of printing and crashing later on None input.
        raise ValueError(
            "Virtual Screening requires drug candidates input (a list of SMILESs)"
        )

    pretrained_model_names = [['Daylight', 'AAC']]
    y_preds_models = []

    # FIX: use boolean 'and' / 'not' instead of bitwise '&' / '== False'.
    if pretrained_dir is None and pretrained:
        print('Beginning Downloading Pretrained Model...')
        print(
            'Note: if you have already download the pretrained model before, please stop the program and set the input parameter \'pretrained_dir\' to the path'
        )
        pretrained_dir = download_pretrained_model('pretrained_models')
    elif not pretrained:
        print('Beginning Downloading Configs Files for training from scratch...')
        pretrained_dir = download_pretrained_model('models_configs')
    else:
        print('Checking if pretrained directory is valid...')
        if not os.path.exists(pretrained_dir):
            print(
                'The directory to pretrained model is not found. Please double check, or download it again by setting the input parameter \'pretrained_dir\' to be \'None\''
            )
        else:
            print('Beginning to load the pretrained models...')

    if train_drug is None:
        # Inference-only path: run every pretrained model as-is.
        print('Using pretrained model and making predictions...')
        for idx, model_name in enumerate(pretrained_model_names):
            model_path = os.path.join(
                pretrained_dir, 'model_' + model_name[0] + '_' + model_name[1])
            model = models.model_pretrained(model_path)
            result_folder_path = os.path.join(
                save_dir, 'results_' + model_name[0] + '_' + model_name[1])
            if not os.path.exists(result_folder_path):
                os.mkdir(result_folder_path)
            y_pred = models.virtual_screening(X_repurpose, target, model,
                                              drug_names, target_name,
                                              convert_y=convert_y,
                                              result_folder=result_folder_path,
                                              verbose=False)
            y_preds_models.append(y_pred)
            print('Predictions from model ' + str(idx + 1) +
                  ' with drug encoding ' + model_name[0] +
                  ' and target encoding ' + model_name[1] + ' are done...')
            print('-------------')
    else:
        # Customized training data: fine-tune (or train from scratch) first.
        print('Training on your own customized data...')
        if not os.path.exists(os.path.join(save_dir, 'new_trained_models')):
            os.mkdir(os.path.join(save_dir, 'new_trained_models'))
        new_trained_models_dir = os.path.join(save_dir, 'new_trained_models')
        if isinstance(train_target, str):
            train_target = [train_target]
        for idx, model_name in enumerate(pretrained_model_names):
            drug_encoding = model_name[0]
            target_encoding = model_name[1]
            train, val, test = data_process(train_drug, train_target, train_y,
                                            drug_encoding, target_encoding,
                                            split_method=split, frac=frac,
                                            sample_frac=subsample_frac)
            model_path = os.path.join(
                pretrained_dir, 'model_' + model_name[0] + '_' + model_name[1])
            if pretrained:
                model = models.model_pretrained(model_path)
                print('Use pretrained model...')
            else:
                config = load_dict(model_path)
                model = models.model_initialize(**config)
                print('Training from scratch...')  # FIX: typo "scrtach"
            print('Begin to train model ' + str(idx) + ' with drug encoding ' +
                  drug_encoding + ' and target encoding ' + target_encoding)
            model.config['train_epoch'] = finetune_epochs
            model.config['LR'] = finetune_LR
            model.config['batch_size'] = finetune_batch_size
            result_folder_path = os.path.join(
                save_dir, 'results_' + model_name[0] + '_' + model_name[1])
            if not os.path.exists(result_folder_path):
                os.mkdir(result_folder_path)
            model.config['result_folder'] = result_folder_path
            model.train(train, val, test)
            print('model training finished, now doing virtual screening')
            y_pred = models.virtual_screening(X_repurpose, target, model,
                                              drug_names, target_name,
                                              convert_y=convert_y,
                                              result_folder=result_folder_path,
                                              verbose=False)
            y_preds_models.append(y_pred)
            print('Predictions from model ' + str(idx) +
                  ' with drug encoding ' + model_name[0] +
                  ' and target encoding ' + model_name[1] + ' are done...')
            model.save_model(
                os.path.join(new_trained_models_dir,
                             'model_' + model_name[0] + '_' + model_name[1]))

    result_folder_path = os.path.join(save_dir, 'results_aggregation')
    if not os.path.exists(result_folder_path):
        os.mkdir(result_folder_path)
    print('models prediction finished...')
    print('aggregating results...')

    # Aggregate across models. With convert_y the scores are in nM where
    # lower means stronger binding, hence 'max effect' == elementwise min.
    if agg == 'mean':
        y_pred = np.mean(y_preds_models, axis=0)
    elif agg == 'max_effect':
        if convert_y:
            y_pred = np.min(y_preds_models, axis=0)
        else:
            y_pred = np.max(y_preds_models, axis=0)
    elif agg == 'agg_mean_max':
        if convert_y:
            y_pred = (np.min(y_preds_models, axis=0) +
                      np.mean(y_preds_models, axis=0)) / 2
        else:
            y_pred = (np.max(y_preds_models, axis=0) +
                      np.mean(y_preds_models, axis=0)) / 2

    # Persist the raw aggregation variants for later inspection.
    with open(os.path.join(result_folder_path, 'logits_VS_mean.pkl'), 'wb') as f:
        pickle.dump(np.mean(y_preds_models, axis=0), f, pickle.HIGHEST_PROTOCOL)
    with open(os.path.join(result_folder_path, 'logits_VS_max.pkl'), 'wb') as f:
        pickle.dump(np.min(y_preds_models, axis=0), f, pickle.HIGHEST_PROTOCOL)
    with open(os.path.join(result_folder_path, 'logits_VS_mean_max.pkl'), 'wb') as f:
        pickle.dump((np.min(y_preds_models, axis=0) +
                     np.mean(y_preds_models, axis=0)) / 2,
                    f, pickle.HIGHEST_PROTOCOL)

    fo = os.path.join(result_folder_path, "virtual_screening.txt")
    print_list = []
    if model.binary:
        table_header = [
            "Rank", "Drug Name", "Target Name", "Interaction", "Probability"
        ]
    else:  # regression
        table_header = ["Rank", "Drug Name", "Target Name", "Binding Score"]
    table = PrettyTable(table_header)

    with open(fo, 'w') as fout:
        print('virtual screening...')
        # NOTE(review): this re-predicts with the LAST trained/loaded model,
        # overwriting the aggregated y_pred computed above. Preserved as-is —
        # confirm whether the aggregation was meant to feed the table.
        df_data = data_process_repurpose_virtual_screening(
            X_repurpose, target, model.drug_encoding, model.target_encoding,
            'virtual screening')
        y_pred = model.predict(df_data)
        if convert_y:
            y_pred = convert_y_unit(np.array(y_pred), 'p', 'nM')
        print('---------------')
        if drug_names is not None and target_name is not None:
            print('Virtual Screening Result')
            for i in range(len(target)):
                if model.binary:
                    if y_pred[i] > 0.5:
                        string_lst = [
                            drug_names[i], target_name[i], "YES",
                            "{0:.2f}".format(y_pred[i])
                        ]
                    else:
                        string_lst = [
                            drug_names[i], target_name[i], "NO",
                            "{0:.2f}".format(y_pred[i])
                        ]
                else:  # regression
                    string_lst = [
                        drug_names[i], target_name[i],
                        "{0:.2f}".format(y_pred[i])
                    ]
                print_list.append((string_lst, y_pred[i]))
        # nM scores rank ascending (strongest binder first), otherwise descending.
        if convert_y:
            print_list.sort(key=lambda x: x[1])
        else:
            print_list.sort(key=lambda x: x[1], reverse=True)
        print_list = [i[0] for i in print_list]
        for idx, lst in enumerate(print_list):
            table.add_row([str(idx + 1)] + lst)
        fout.write(table.get_string())

    # Echo the top of the ranked table; point at the file for the rest.
    with open(fo, 'r') as fin:
        lines = fin.readlines()
        for idx, line in enumerate(lines):
            if idx < output_len + 3:
                print(line, end='')
            else:
                print('checkout ' + fo + ' for the whole list')
                break
    print()
    with open(os.path.join(result_folder_path, 'output_list_VS.pkl'), 'wb') as f:
        pickle.dump(print_list, f, pickle.HIGHEST_PROTOCOL)
# --- End-to-end DeepPurpose demo: train, load pretrained, then repurpose ---
drug_encoding, target_encoding = 'MPNN', 'Transformer'

# Data processing with a cold-protein split (held-out proteins in val/test).
train, val, test = data_process(X_drug, X_target, y,
                                drug_encoding, target_encoding,
                                split_method='cold_protein',
                                frac=[0.7, 0.1, 0.2])

# New model from default parameters; keyword arguments allow tuning.
config = generate_config(drug_encoding, target_encoding,
                         transformer_n_layer_target=8)
net = models.model_initialize(**config)

# Train the model. Detailed output (validation-loss table, metrics, AUC
# figures, etc.) is stored in the ./result folder.
net.train(train, val, test)

# Alternatively, load a pretrained model from a directory path or by a
# reproduced model name such as DeepDTA.
net = models.model_pretrained(MODEL_PATH_DIR or MODEL_NAME)

# Repurpose with the trained / pretrained model: Broad Repurposing Hub
# drugs against the SARS-CoV 3CL protease target.
X_repurpose, drug_name, drug_cid = load_broad_repurposing_hub(SAVE_PATH)
target, target_name = load_SARS_CoV_Protease_3CL()
_ = models.repurpose(X_repurpose, target, net, drug_name, target_name)
# --- Evaluate a pretrained Kd model on the NPASS held-out test set ---
import DeepPurpose.DTI as models
# FIX: utils.data_process was called below without ever importing utils.
from DeepPurpose import utils
from time import time
t1 = time()
import pandas as pd

test_df_path = '/content/drive/MyDrive/Colab Notebooks/data/NPASS_kd_test_for_deeppurpose.csv'
# NOTE(review): error_bad_lines is deprecated in pandas >= 1.3
# (on_bad_lines='skip' is the replacement) — kept for compatibility.
test_df = pd.read_csv(test_df_path, error_bad_lines=False, encoding="Latin-1")
print(len(test_df))

smiles = list(test_df['SMILES'])
target_sequence = list(test_df['Target_sequence'])
labels = list(test_df['log_kd'])

model = models.model_pretrained(
    path_dir=
    '/content/drive/MyDrive/Colab Notebooks/models/DeepPurpose_Kd_models/NPASS_tranformer_protein_1'
)
print(model.config)

# Encodings must match those the loaded model was trained with.
drug_encoding = 'CNN'
target_encoding = 'Transformer'
X_pred = utils.data_process(smiles, target_sequence, labels,
                            drug_encoding, target_encoding,
                            split_method='no_split')
y_pred = model.predict(X_pred)
def repurpose(target,
              target_name=None,
              X_repurpose=None,
              drug_names=None,
              train_drug=None,
              train_target=None,
              train_y=None,
              save_dir='./save_folder',
              pretrained_dir=None,
              finetune_epochs=10,
              finetune_LR=0.001,
              finetune_batch_size=32,
              convert_y=True,
              subsample_frac=1,
              pretrained=True,
              split='random',
              frac=None,
              agg='agg_mean_max',
              output_len=30):
    """Repurpose a drug library against a single protein target.

    Runs an ensemble of pretrained DeepPurpose models (or fine-tunes /
    trains them on user data), aggregates their predictions, writes a
    ranked table to ``repurposing.txt`` and pickles the result list under
    ``save_dir``. When no drug library is given, the Broad Repurposing Hub
    is downloaded and used.

    Parameters
    ----------
    target : protein sequence to repurpose against.
    target_name : display name for the target ('New Target' when None).
    X_repurpose / drug_names : drug library SMILES and display names.
    train_drug, train_target, train_y : optional custom training data.
    save_dir : output directory (created if missing).
    pretrained_dir : path to downloaded models; downloaded when None.
    finetune_epochs, finetune_LR, finetune_batch_size : fine-tuning config.
    convert_y : convert model outputs from p-scale to nM (lower = stronger).
    subsample_frac, split, frac : data_process options for custom training.
    agg : 'mean', 'max_effect', or 'agg_mean_max' aggregation strategy.
    output_len : number of table rows echoed to stdout.
    """
    # FIX: avoid a shared mutable default argument for the split fractions.
    if frac is None:
        frac = [0.7, 0.1, 0.2]

    if not os.path.exists(save_dir):
        print(
            'Save path not found or given and set to default: \'./save_folder/\'. '
        )
        # FIX: create the requested directory instead of always creating
        # './save_folder' and silently discarding a user-supplied save_dir
        # (this also matches the sibling virtual_screening()).
        os.mkdir(save_dir)

    if target_name is None:
        target_name = 'New Target'

    if X_repurpose is not None:
        if drug_names is None:
            drug_names = ['Drug ' + str(i) for i in range(len(X_repurpose))]
        print("Loading customized repurposing dataset...")
    else:
        # Default repurposing library: the Broad Repurposing Hub.
        if not os.path.exists(os.path.join(save_dir, 'data')):
            os.mkdir(os.path.join(save_dir, 'data'))
        data_path = os.path.join(save_dir, 'data')
        X_repurpose, _, drug_names = load_broad_repurposing_hub(data_path)

    pretrained_model_names = [['MPNN', 'CNN'], ['CNN', 'CNN'],
                              ['Morgan', 'CNN'], ['Morgan', 'AAC'],
                              ['Daylight', 'AAC']]
    y_preds_models = []

    # FIX: use boolean 'and' / 'not' instead of bitwise '&' / '== False'.
    if pretrained_dir is None and pretrained:
        print('Beginning Downloading Pretrained Model...')
        print(
            'Note: if you have already download the pretrained model before, please stop the program and set the input parameter \'pretrained_dir\' to the path'
        )
        url = 'https://deeppurpose.s3.amazonaws.com/pretrained_models.zip'
        if not os.path.exists(os.path.join(save_dir, 'pretrained_models')):
            os.mkdir(os.path.join(save_dir, 'pretrained_models'))
        pretrained_dir = os.path.join(save_dir, 'pretrained_models')
        pretrained_dir_ = wget.download(url, pretrained_dir)
        print('Downloading finished... Beginning to extract zip file...')
        # FIX: renamed context variable from 'zip' (shadowed the builtin).
        with ZipFile(pretrained_dir_, 'r') as zf:
            zf.extractall(path=pretrained_dir)
        print('Pretrained Models Successfully Downloaded...')
        pretrained_dir = os.path.join(pretrained_dir, 'DeepPurpose_BindingDB')
    elif not pretrained:
        print('Beginning Downloading Configs Files for training from scratch...')
        url = 'https://deeppurpose.s3.amazonaws.com/models_configs.zip'
        if not os.path.exists(os.path.join(save_dir, 'models_configs')):
            os.mkdir(os.path.join(save_dir, 'models_configs'))
        pretrained_dir = os.path.join(save_dir, 'models_configs')
        pretrained_dir_ = wget.download(url, pretrained_dir)
        print('Downloading finished... Beginning to extract zip file...')
        with ZipFile(pretrained_dir_, 'r') as zf:
            zf.extractall(path=pretrained_dir)
        print('Configs Models Successfully Downloaded...')
        pretrained_dir = os.path.join(pretrained_dir, 'models_configs')
    else:
        print('Checking if pretrained directory is valid...')
        if not os.path.exists(pretrained_dir):
            print(
                'The directory to pretrained model is not found. Please double check, or download it again by setting the input parameter \'pretrained_dir\' to be \'None\''
            )
        else:
            print('Beginning to load the pretrained models...')

    if train_drug is None:
        # Inference-only path: run every pretrained model as-is.
        print('Using pretrained model and making predictions...')
        for idx, model_name in enumerate(pretrained_model_names):
            model_path = os.path.join(
                pretrained_dir, 'model_' + model_name[0] + '_' + model_name[1])
            model = models.model_pretrained(model_path)
            result_folder_path = os.path.join(
                save_dir, 'results_' + model_name[0] + '_' + model_name[1])
            if not os.path.exists(result_folder_path):
                os.mkdir(result_folder_path)
            y_pred = models.repurpose(X_repurpose, target, model,
                                      drug_names, target_name,
                                      convert_y=convert_y,
                                      result_folder=result_folder_path,
                                      verbose=False)
            y_preds_models.append(y_pred)
            print('Predictions from model ' + str(idx + 1) +
                  ' with drug encoding ' + model_name[0] +
                  ' and target encoding ' + model_name[1] + ' are done...')
            print('-------------')
    else:
        # Customized training data: fine-tune (or train from scratch) first.
        print('Training on your own customized data...')
        if not os.path.exists(os.path.join(save_dir, 'new_trained_models')):
            os.mkdir(os.path.join(save_dir, 'new_trained_models'))
        new_trained_models_dir = os.path.join(save_dir, 'new_trained_models')
        if isinstance(train_target, str):
            train_target = [train_target]
        for idx, model_name in enumerate(pretrained_model_names):
            drug_encoding = model_name[0]
            target_encoding = model_name[1]
            train, val, test = data_process(train_drug, train_target, train_y,
                                            drug_encoding, target_encoding,
                                            split_method=split, frac=frac,
                                            sample_frac=subsample_frac)
            model_path = os.path.join(
                pretrained_dir, 'model_' + model_name[0] + '_' + model_name[1])
            if pretrained:
                model = models.model_pretrained(model_path)
                print('Use pretrained model...')
            else:
                config = load_dict(model_path)
                model = models.model_initialize(**config)
                print('Training from scratch...')  # FIX: typo "scrtach"
            print('Begin to train model ' + str(idx) + ' with drug encoding ' +
                  drug_encoding + ' and target encoding ' + target_encoding)
            model.config['train_epoch'] = finetune_epochs
            model.config['LR'] = finetune_LR
            model.config['batch_size'] = finetune_batch_size
            result_folder_path = os.path.join(
                save_dir, 'results_' + model_name[0] + '_' + model_name[1])
            if not os.path.exists(result_folder_path):
                os.mkdir(result_folder_path)
            model.config['result_folder'] = result_folder_path
            model.train(train, val, test)
            print('model training finished, now repurposing')
            y_pred = models.repurpose(X_repurpose, target, model,
                                      drug_names, target_name,
                                      convert_y=convert_y,
                                      result_folder=result_folder_path,
                                      verbose=False)
            y_preds_models.append(y_pred)
            print('Predictions from model ' + str(idx) +
                  ' with drug encoding ' + model_name[0] +
                  ' and target encoding ' + model_name[1] + ' are done...')
            model.save_model(
                os.path.join(new_trained_models_dir,
                             'model_' + model_name[0] + '_' + model_name[1]))

    result_folder_path = os.path.join(save_dir, 'results_aggregation')
    if not os.path.exists(result_folder_path):
        os.mkdir(result_folder_path)
    print('models prediction finished...')
    print('aggregating results...')

    # Aggregate across models. With convert_y the scores are in nM where
    # lower means stronger binding, hence 'max effect' == elementwise min.
    if agg == 'mean':
        y_pred = np.mean(y_preds_models, axis=0)
    elif agg == 'max_effect':
        if convert_y:
            y_pred = np.min(y_preds_models, axis=0)
        else:
            y_pred = np.max(y_preds_models, axis=0)
    elif agg == 'agg_mean_max':
        if convert_y:
            y_pred = (np.min(y_preds_models, axis=0) +
                      np.mean(y_preds_models, axis=0)) / 2
        else:
            y_pred = (np.max(y_preds_models, axis=0) +
                      np.mean(y_preds_models, axis=0)) / 2

    fo = os.path.join(result_folder_path, "repurposing.txt")
    print_list = []
    with open(fo, 'w') as fout:
        print('---------------')
        if target_name is not None:
            print('Drug Repurposing Result for ' + target_name)
        if model.binary:
            table_header = [
                "Rank", "Drug Name", "Target Name", "Interaction", "Probability"
            ]
        else:  # regression
            table_header = [
                "Rank", "Drug Name", "Target Name", "Binding Score"
            ]
        table = PrettyTable(table_header)
        if drug_names is not None:
            for i in range(len(y_pred)):
                if model.binary:
                    if y_pred[i] > 0.5:
                        string_lst = [
                            drug_names[i], target_name, "YES",
                            "{0:.2f}".format(y_pred[i])
                        ]
                    else:
                        string_lst = [
                            drug_names[i], target_name, "NO",
                            "{0:.2f}".format(y_pred[i])
                        ]
                else:
                    # Regression row: Rank, Drug Name, Target Name, score.
                    string_lst = [
                        drug_names[i], target_name,
                        "{0:.2f}".format(y_pred[i])
                    ]
                print_list.append((string_lst, y_pred[i]))
        # nM scores rank ascending (strongest binder first), otherwise descending.
        if convert_y:
            print_list.sort(key=lambda x: x[1])
        else:
            print_list.sort(key=lambda x: x[1], reverse=True)
        print_list = [i[0] for i in print_list]
        for idx, lst in enumerate(print_list):
            table.add_row([str(idx + 1)] + lst)
        fout.write(table.get_string())

    # Echo the top of the ranked table; point at the file for the rest.
    with open(fo, 'r') as fin:
        lines = fin.readlines()
        for idx, line in enumerate(lines):
            if idx < output_len + 3:
                print(line, end='')
            else:
                print('checkout ' + fo + ' for the whole list')
                break
    print()
    with open(os.path.join(result_folder_path, 'output_list.pkl'), 'wb') as f:
        pickle.dump(print_list, f, pickle.HIGHEST_PROTOCOL)