def main(unused_argv):
    """Main function that extracts the continuous data-driven descriptors for a file of SMILES."""
    if FLAGS.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(FLAGS.device)
    model_dir = FLAGS.model_dir
    file = FLAGS.input
    df = read_input(file)
    if FLAGS.preprocess:
        print("start preprocessing SMILES...")
        df["new_smiles"] = df[FLAGS.smiles_header].map(preprocess_smiles)
        sml_list = df[~df.new_smiles.isna()].new_smiles.tolist()
        print("finished preprocessing SMILES!")
    else:
        sml_list = df[FLAGS.smiles_header].tolist()
    print("start calculating descriptors...")
    infer_model = InferenceModel(model_dir=model_dir,
                                 use_gpu=FLAGS.gpu,
                                 batch_size=FLAGS.batch_size,
                                 cpu_threads=FLAGS.cpu_threads)
    descriptors = infer_model.seq_to_emb(sml_list)
    print("finished calculating descriptors! %d out of %d input SMILES could be interpreted"
          % (len(sml_list), len(df)))
    if FLAGS.preprocess:
        df = df.join(
            pd.DataFrame(descriptors,
                         index=df[~df.new_smiles.isna()].index,
                         columns=["cddd_" + str(i + 1) for i in range(512)]))
    else:
        df = df.join(
            pd.DataFrame(descriptors,
                         index=df.index,
                         columns=["cddd_" + str(i + 1) for i in range(512)]))
    print("writing descriptors to file...")
    df.to_csv(FLAGS.output)
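# A minimal sketch of how this script could be invoked from the command line.
# The flag names (--input, --output, --smiles_header, --model_dir, --preprocess,
# --batch_size, --cpu_threads) come from the FLAGS used above; the entry-point
# file name run_cddd.py is an assumption.
#
#   python run_cddd.py \
#       --input smiles.csv \
#       --output descriptors.csv \
#       --smiles_header smiles \
#       --model_dir default_model \
#       --preprocess \
#       --batch_size 512 \
#       --cpu_threads 5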
def main(unused_argv):
    """Main function to test the performance of the translation model at
    extracting meaningful features for QSAR modelling."""
    if FLAGS.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(FLAGS.device)
        print("use gpu {}".format(str(FLAGS.device)))
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    model_dir = FLAGS.model_dir
    infer_model = InferenceModel(model_dir,
                                 use_gpu=FLAGS.gpu,
                                 cpu_threads=FLAGS.cpu_threads)
    ames_df = pd.read_csv("ames.csv")
    ames_smls = ames_df.smiles.tolist()
    ames_labels = ames_df.label.values
    ames_fold = ames_df.fold.values
    print("Extracting molecular descriptors for Ames")
    ames_emb = infer_model.seq_to_emb(ames_smls)
    ames_emb = (ames_emb - ames_emb.mean()) / ames_emb.std()
    lipo_df = pd.read_csv("lipo.csv")
    lipo_smls = lipo_df.smiles.tolist()
    lipo_labels = lipo_df.label.values
    lipo_fold = lipo_df.fold.values
    print("Extracting molecular descriptors for Lipophilicity")
    lipo_emb = infer_model.seq_to_emb(lipo_smls)
    lipo_emb = (lipo_emb - lipo_emb.mean()) / lipo_emb.std()
    print("Running SVM on Ames mutagenicity...")
    clf = SVC(C=5.0)
    result = cross_val_score(clf,
                             ames_emb,
                             ames_labels,
                             groups=ames_fold,
                             cv=LeaveOneGroupOut(),
                             n_jobs=5)
    print("Ames mutagenicity accuracy: %0.3f +/- %0.3f"
          % (np.mean(result), np.std(result)))
    print("Running SVM on Lipophilicity...")
    clf = SVR(C=5.0)
    result = cross_val_score(clf,
                             lipo_emb,
                             lipo_labels,
                             groups=lipo_fold,
                             cv=LeaveOneGroupOut(),
                             n_jobs=5)
    print("Lipophilicity r2: %0.3f +/- %0.3f"
          % (np.mean(result), np.std(result)))
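# A self-contained sketch (not part of the original script) illustrating how
# LeaveOneGroupOut uses the precomputed fold column: each fold is held out once
# while the remaining folds are used for training. The toy data is arbitrary.
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
from sklearn.svm import SVC

X = np.random.rand(12, 4)                 # toy feature matrix
y = np.random.randint(0, 2, size=12)      # toy binary labels
groups = np.repeat([0, 1, 2], 4)          # three precomputed folds

scores = cross_val_score(SVC(C=5.0), X, y, groups=groups, cv=LeaveOneGroupOut())
print(scores)  # one accuracy value per held-out fold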
def main(args):
    # Initialize molscore
    ms = MolScore(model_name='mso', task_config=args.molscore_config)
    ms.log_parameters(vars(args))
    # Load inference model
    infer_model = InferenceModel(model_dir=args.infer_model)
    # Load init smiles
    assert os.path.exists(args.smiles_file)
    with open(args.smiles_file, 'r') as f:
        init_smiles = f.read().splitlines()
    # Initialize optimizer
    opt = BasePSOptimizer.from_query(init_smiles=init_smiles,
                                     num_part=args.num_part,
                                     num_swarms=args.num_swarm,
                                     inference_model=infer_model,
                                     scoring_function=ms)
    # Run optimizer
    opt.run(args.num_steps)
    # Save everything
    ms.write_scores()
    ms.kill_dash_monitor()
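# A hedged sketch of the argument parsing this main() appears to expect. The
# parser wiring below is an assumption; only the attribute names
# (molscore_config, infer_model, smiles_file, num_part, num_swarm, num_steps)
# come from the code above, and the defaults are placeholders.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--molscore_config', help='Path to a molscore task config')
    parser.add_argument('--infer_model', help='Directory of the pretrained CDDD model')
    parser.add_argument('--smiles_file', help='File with one seed SMILES per line')
    parser.add_argument('--num_part', type=int, default=200, help='Particles per swarm')
    parser.add_argument('--num_swarm', type=int, default=1, help='Number of swarms')
    parser.add_argument('--num_steps', type=int, default=100, help='Optimization steps')
    main(parser.parse_args())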
class Embeddings(BaseTransformation):

    def __init__(self, use_gpu=True, cpu_threads=5, model_dir=None, **kwargs):
        self.name = __class__.__name__.split('.')[-1]
        self.kwargs = TransformationDefaults[self.name].value
        self.kwargs.update(kwargs)
        # Note: the model_dir argument is ignored; the pretrained CDDD model is
        # always downloaded and used instead.
        model_dir = download_cddd_models()
        self.func = InferenceModel(model_dir,
                                   use_gpu=use_gpu,
                                   cpu_threads=cpu_threads)

    def transform(self, data):
        data = data['transformed_smiles']
        return self.func.seq_to_emb(data).squeeze()

    def inverse_transform(self, embeddings):
        """Embedding array -- individual compound embeddings are in rows."""
        embeddings = np.asarray(embeddings)
        return self.func.emb_to_seq(embeddings)

    def __len__(self):
        return self.func.hparams.emb_size
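# A minimal usage sketch for the Embeddings transformation, assuming the
# pretrained CDDD model can be downloaded; the example SMILES are arbitrary.
emb_transform = Embeddings(use_gpu=False)
batch = {'transformed_smiles': ['CCO', 'c1ccccc1']}
vectors = emb_transform.transform(batch)             # (2, 512) descriptor matrix
decoded = emb_transform.inverse_transform(vectors)   # decode back to SMILES
print(len(emb_transform), decoded)                   # embedding size and SMILES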
def getemb_from_smi(smiles_nonzero_list, smiles_zero_list, nonzero_id):
    """Compute the CDDD encodings of SMILES.

    Args:
        smiles_nonzero_list, smiles_zero_list: same as above
    Returns:
        x: SMILES embeddings
        y: fragment labels
        dfsmi_eb: embeddings as a DataFrame
    """
    inference_model = InferenceModel()
    smiles_emb_nonzero = inference_model.seq_to_emb(smiles_nonzero_list)
    smiles_emb_zero = inference_model.seq_to_emb(smiles_zero_list)
    dfsmi_nonzero_eb = pd.DataFrame(smiles_emb_nonzero)
    dfsmi_zero_eb = pd.DataFrame(smiles_emb_zero)
    dfsmi_eb = pd.concat([dfsmi_nonzero_eb, dfsmi_zero_eb])
    x = dfsmi_eb.to_numpy()
    count = x.shape[0]
    y = np.concatenate([np.ones(nonzero_id), np.zeros(count - nonzero_id)])
    return x, y, dfsmi_eb
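# A hedged sketch of how the embeddings and labels returned above could feed a
# simple downstream classifier; the SMILES lists here are placeholder examples
# and a pretrained CDDD model must be available for InferenceModel().
from sklearn.linear_model import LogisticRegression

x, y, _ = getemb_from_smi(['CCO', 'CCN'], ['c1ccccc1'], nonzero_id=2)
clf = LogisticRegression(max_iter=1000).fit(x, y)
print(clf.score(x, y))  # training accuracy on the toy data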
def optimize(chid, n_estimators, n_jobs, external_file, n_external, seed,
             opt_name, optimizer_args, log_base):
    """
    Args:
        - chid: which assay to use
        - n_estimators: how many trees to use in the Random Forest
        - n_jobs: how many parallel processes to use
        - external_file: SMILES that are not used for optimization
        - n_external: how many such independent random points to score
        - seed: which random seed to use
        - opt_name: which optimizer to use (graph_ga, lstm_hc or mso)
        - optimizer_args: dictionary with arguments for the optimizer
        - log_base: where to store results; a timestamp is appended
    """
    config = locals()

    # Results might not be fully reproducible when using pytorch
    # https://pytorch.org/docs/stable/notes/randomness.html
    np.random.seed(seed)
    torch.manual_seed(seed)

    # set up logging
    results_dir = os.path.join(log_base, opt_name, chid, timestamp())
    os.makedirs(results_dir)
    config_file = os.path.join(results_dir, 'config.json')
    with open(config_file, 'w') as f:
        json.dump(config, f)

    clfs, aucs, balance, df1, df2 = fit_clfs(chid, n_estimators, n_jobs)
    results = {}
    results['AUC'] = aucs
    results['balance'] = balance

    clf_file = os.path.join(results_dir, 'classifiers.p')
    with open(clf_file, 'wb') as f:
        pickle.dump(clfs, f)
    df1.to_csv(os.path.join(results_dir, 'split1.csv'), index=False)
    df2.to_csv(os.path.join(results_dir, 'split2.csv'), index=False)

    # Create guacamol scoring function with the classifier trained on split 1
    scoring_function = TPScoringFunction(clfs['Split1'])

    # The CDDD inference model used to encode/decode molecular SMILES strings
    # to/from the CDDD space. You might need to specify the path to the
    # pretrained model (e.g. default_model).
    infer_model = InferenceModel(model_dir='default_model')
    mso_score = [ScoringFunction(func=scoring_function.raw_score_list,
                                 name='score',
                                 is_mol_func=False,
                                 is_smiles_func=True)]
    # Alternatively, wrap the drug-likeness score inside a scoring function instance:
    # from mso.objectives.mol_functions import qed_score
    # mso_score = [ScoringFunction(func=qed_score, name="qed", is_mol_func=True)]

    class MsoWrapper(object):

        def __init__(self, smi_file, num_part, num_iter):
            self.smi_file = smi_file
            self.num_part = num_part
            self.num_iter = num_iter
            with open(self.smi_file) as f:
                self.start_pool = f.read().split()

        def run(self):
            init_smiles = list(np.random.choice(self.start_pool, self.num_part))
            opt = BasePSOptimizer.from_query(
                init_smiles=init_smiles,
                num_part=self.num_part,
                num_swarms=1,
                inference_model=infer_model,
                scoring_functions=mso_score)
            _, smiles_history = opt.run(self.num_iter)
            return smiles_history

    # run optimization
    t0 = time()
    if opt_name == 'graph_ga':
        optimizer = GB_GA_Generator(**optimizer_args)
    elif opt_name == 'lstm_hc':
        optimizer = SmilesRnnDirectedGenerator(**optimizer_args)
    elif opt_name == 'mso':
        optimizer = MsoWrapper(**optimizer_args)
    else:
        raise ValueError(f'Invalid optimizer: {opt_name}')

    if opt_name == 'mso':
        smiles_history = optimizer.run()
    else:
        smiles_history = optimizer.generate_optimized_molecules(
            scoring_function, 100, get_history=True)
    smiles_history = [can_list(e) for e in smiles_history]
    t1 = time()
    opt_time = t1 - t0

    # make a list of dictionaries for every time step
    # this is far from an optimal data structure
    statistics = []
    for optimized_smiles in smiles_history:
        row = {}
        row['smiles'] = optimized_smiles
        row['preds'] = {}
        for k, clf in clfs.items():
            preds = score(optimized_smiles, clf)
            if None in preds:
                print('Invalid score. Debug message')
            row['preds'][k] = preds
        statistics.append(row)
    results['statistics'] = statistics
    stat_time = time() - t1

    # add predictions on the external set
    # load external smiles for evaluation
    with open(external_file) as f:
        external_smiles = f.read().split()
    external_smiles = np.random.choice(external_smiles, n_external)
    results['predictions_external'] = {k: score(external_smiles, clf)
                                       for k, clf in clfs.items()}

    results_file = os.path.join(results_dir, 'results.json')
    with open(results_file, 'w') as f:
        json.dump(results, f)
    print(f'Storing results in {results_dir}')
    print(f'Optimization time {opt_time:.2f}')
    print(f'Statistics time {stat_time:.2f}')
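# A hedged sketch of how optimize() could be called for the MSO branch. The
# assay id, file paths and numeric values are placeholders; only the
# optimizer_args keys (smi_file, num_part, num_iter) come from MsoWrapper above.
optimize(chid='CHEMBL1909203',
         n_estimators=100,
         n_jobs=4,
         external_file='external_smiles.smi',
         n_external=1000,
         seed=42,
         opt_name='mso',
         optimizer_args={'smi_file': 'seed_smiles.smi',
                         'num_part': 200,
                         'num_iter': 100},
         log_base='./results')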
def cv(dataset, summary_df, cddd_model_dir, molbert_model_dir):
    df, indices = get_data(dataset)
    cddd = InferenceModel(cddd_model_dir)  # type: ignore
    molbert = MolBertFeaturizer(molbert_model_dir,
                                embedding_type='average-1-cat-pooled',
                                max_seq_len=200,
                                device='cpu')  # type: ignore
    ecfp = MorganFPFeaturizer(fp_size=2048,
                              radius=2,
                              use_counts=True,
                              use_features=False)
    rdkit_norm = PhysChemFeaturizer(normalise=True)

    cddd_fn = lambda smiles: cddd.seq_to_emb(smiles)
    molbert_fn = lambda smiles: molbert.transform(smiles)[0]
    ecfp_fn = lambda smiles: ecfp.transform(smiles)[0]
    rdkit_norm_fn = lambda smiles: rdkit_norm.transform(smiles)[0]

    for i, (train_idx, valid_idx, test_idx) in enumerate(indices):
        train_df = df.iloc[train_idx]
        valid_df = df.iloc[valid_idx]
        # Combine train and valid set as SVMs don't use a validation set, but
        # NNs do. This way both model families see the same amount of data.
        train_df = pd.concat([train_df, valid_df])
        test_df = df.iloc[test_idx]

        fn_combos = [('cddd', cddd_fn), ('molbert', molbert_fn),
                     ('ECFP4', ecfp_fn), ('rdkit_norm', rdkit_norm_fn)]

        for feat_name, feat_fn in fn_combos:
            train_features = np.vstack([
                feat_fn(batch) for batch in batchify(train_df['SMILES'], 256)
            ])
            train_labels = train_df[df.columns[-1]]
            test_features = np.vstack([
                feat_fn(batch) for batch in batchify(test_df['SMILES'], 256)
            ])
            test_labels = test_df[df.columns[-1]]

            mode = summary_df[summary_df['task_name'] ==
                              dataset].iloc[0]['task_type'].strip()
            np.random.seed(i)
            if mode == 'regression':
                model = SVR(C=5.0)
            elif mode == 'classification':
                model = SVC(C=5.0, probability=True)
            else:
                raise ValueError(
                    f'Mode has to be either classification or regression but was {mode}.'
                )
            model.fit(train_features, train_labels)
            predictions = model.predict(test_features)

            if mode == 'classification':
                # predict probabilities (needed for some metrics) and take the
                # probability of the positive class ([:, 1])
                prob_predictions = model.predict_proba(test_features)[:, 1]
                metrics_dict = {
                    'AUROC': lambda: metrics.roc_auc_score(
                        test_labels, prob_predictions),
                    'AveragePrecision': lambda: metrics.average_precision_score(
                        test_labels, prob_predictions),
                    'Accuracy': lambda: metrics.accuracy_score(
                        test_labels, predictions),
                }
            else:
                metrics_dict = {
                    'MAE': lambda: metrics.mean_absolute_error(
                        test_labels, predictions),
                    'RMSE': lambda: np.sqrt(
                        metrics.mean_squared_error(test_labels, predictions)),
                    'MSE': lambda: metrics.mean_squared_error(
                        test_labels, predictions),
                    'R2': lambda: metrics.r2_score(test_labels, predictions),
                }

            metric_values = {}
            for name, callable_metric in metrics_dict.items():
                try:
                    metric_values[name] = callable_metric()
                except Exception as e:
                    print(f'unable to calculate {name} metric')
                    print(e)
                    metric_values[name] = np.nan

            default_path = os.path.join(
                './logs/', datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ'))
            output_dir = os.path.join(default_path, dataset, str(i))
            os.makedirs(output_dir, exist_ok=True)
            with open(os.path.join(output_dir, f'{feat_name}_metrics.json'),
                      'w+') as fp:
                json.dump(metric_values, fp)
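# The batchify helper used above is not shown in this snippet. A minimal sketch
# of what it presumably does (yield fixed-size chunks of a sequence) could look
# like this -- an assumption, not the original implementation.
def batchify(sequence, batch_size):
    """Yield successive batch_size-sized chunks from sequence."""
    items = list(sequence)
    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]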