def find_similar_mols_from_file(test_path: str,
                                train_path: str,
                                distance_measure: str,
                                checkpoint_path: Optional[str] = None,
                                num_neighbors: int = -1,
                                batch_size: int = 50) -> List[OrderedDict]:
    """
    For each test molecule, finds the N most similar training molecules
    according to some distance measure. Loads molecules and model from file.

    :param test_path: Path to a CSV file containing test SMILES.
    :param train_path: Path to a CSV file containing train SMILES.
    :param distance_measure: The distance measure to use to determine nearest neighbors.
    :param checkpoint_path: Path to a .pt model checkpoint file
                            (only needed for distance_measure == 'embedding').
    :param num_neighbors: The number of nearest training molecules to find for each
                          test molecule.
    :param batch_size: Batch size.
    :return: A list of OrderedDicts containing the test smiles, the num_neighbors
             nearest training smiles, and other relevant distance info.
    """
    print('Loading data')
    test_smiles, train_smiles = get_smiles(test_path), get_smiles(train_path)

    # The model is only needed for embedding-based distances; skip loading otherwise.
    if checkpoint_path is not None:
        print('Loading model')
        model = load_checkpoint(checkpoint_path)
    else:
        model = None

    return find_similar_mols(test_smiles=test_smiles,
                             train_smiles=train_smiles,
                             distance_measure=distance_measure,
                             model=model,
                             num_neighbors=num_neighbors,
                             batch_size=batch_size)
def run_split_data(args: Args):
    """Splits a raw CSV dataset into train/val/test CSV files.

    Reads the full CSV (header plus rows), pairs each row with its SMILES,
    wraps the rows in a MoleculeDataset so split_data can partition them,
    and writes one CSV per split into args.save_dir.
    """
    # Read the raw CSV: first row is the header, the rest are data rows.
    with open(args.data_path) as in_file:
        rows = csv.reader(in_file)
        header = next(rows)
        lines = [row for row in rows]

    # Extract the SMILES column separately.
    smiles = get_smiles(path=args.data_path, smiles_column=args.smiles_column)

    # Sanity-check that SMILES and raw rows are aligned one-to-one.
    assert len(lines) == len(smiles)
    assert all(smile in line for smile, line in zip(smiles, lines))

    # Wrap each row in a MoleculeDatapoint, stashing the raw CSV row on it
    # so it can be written back out verbatim after splitting.
    datapoints = []
    for smile, line in tqdm(zip(smiles, lines), total=len(smiles)):
        point = MoleculeDatapoint(smiles=smile)
        point.line = line
        datapoints.append(point)
    data = MoleculeDataset(datapoints)

    # Partition into train/val/test according to the requested split.
    train, val, test = split_data(data=data,
                                  split_type=args.split_type,
                                  sizes=args.split_sizes,
                                  seed=args.seed)

    # Write each split back out as a CSV with the original header and rows.
    makedirs(args.save_dir)
    for name, dataset in (('train', train), ('val', val), ('test', test)):
        with open(os.path.join(args.save_dir, f'{name}.csv'), 'w') as out_file:
            writer = csv.writer(out_file)
            writer.writerow(header)
            for point in dataset:
                writer.writerow(point.line)
def interpret(args: InterpretArgs) -> None:
    """Runs MCTS-based rationale extraction for each molecule in the input file.

    Prints a CSV (smiles, property, rationale, rationale_score) to stdout and
    returns a DataFrame with one rationale per input molecule ('N/A' with score 0
    when the molecule does not exceed args.prop_delta or no rationale is found).
    """
    global C_PUCT, MIN_ATOMS

    chemprop_model = ChempropModel(args)

    def scoring_function(smiles: List[str]) -> List[float]:
        # Score only the property of interest (property_id is 1-indexed).
        return chemprop_model(smiles)[:, args.property_id - 1]

    # MCTS hyperparameters are consumed via module-level globals.
    C_PUCT = args.c_puct
    MIN_ATOMS = args.min_atoms

    all_smiles = get_smiles(path=args.data_path, smiles_column=args.smiles_column)
    header = get_header(path=args.data_path)
    # Fall back to a generic column name if the property index is out of range.
    property_name = header[args.property_id] if len(header) > args.property_id else 'score'
    print(f'smiles,{property_name},rationale,rationale_score')

    rat_smiles = []
    rat_scores = []
    for smiles in all_smiles:
        score = scoring_function([smiles])[0]
        # Only search for rationales on molecules that already pass the threshold.
        rationales = mcts(smiles=smiles,
                          scoring_function=scoring_function,
                          n_rollout=args.rollout,
                          max_atoms=args.max_atoms,
                          prop_delta=args.prop_delta) if score > args.prop_delta else []

        if not rationales:
            rat_smiles.append('N/A')
            rat_scores.append(0)
            print(f'{smiles},{score:.3f},,')
            continue

        # Prefer the smallest rationale; break ties by the highest MCTS score P.
        min_size = min(len(x.atoms) for x in rationales)
        smallest = [x for x in rationales if len(x.atoms) == min_size]
        best = max(smallest, key=lambda x: x.P)
        rat_smiles.append(best.smiles)
        rat_scores.append(best.P)
        print(f'{smiles},{score:.3f},{best.smiles},{best.P:.3f}')

    return pd.DataFrame(list(zip(all_smiles, rat_smiles, rat_scores)),
                        columns=['smiles', 'rationale_smiles', 'rationale_score'])
def get_pred_smiles(
        pred_smiles_dir: str,
        return_num_decode: bool = False
) -> Union[List[str], Tuple[List[str], int]]:
    """Loads predicted SMILES from every .txt file under a directory.

    Each .txt file is expected to hold one decoding per source molecule, so all
    files must contain the same number of SMILES. The results are interleaved so
    that all decodings of a given source molecule are contiguous, invalid SMILES
    are replaced, and the fraction of valid SMILES is printed.

    :param pred_smiles_dir: Directory to walk for .txt files of predicted SMILES.
    :param return_num_decode: Whether to also return the number of decodings
                              (i.e. the number of .txt files) per source molecule.
    :return: The reordered SMILES list, plus num_decode if requested.
    """
    # Recursively collect every .txt file under the directory.
    pred_paths = [
        os.path.join(root, fname)
        for root, _, files in os.walk(pred_smiles_dir)
        for fname in files
        if fname.endswith('.txt')
    ]

    # Load the SMILES from each file.
    all_smiles = [get_smiles(path, header=False) for path in pred_paths]

    # Every file must contain the same number of SMILES (one per source molecule).
    sizes = {len(smiles) for smiles in all_smiles}
    assert len(sizes) == 1
    num_examples = sizes.pop()
    num_decode = len(all_smiles)

    # Interleave so that all translations of a source molecule are contiguous.
    smiles = [
        all_smiles[decode_index][example_index]
        for example_index in range(num_examples)
        for decode_index in range(num_decode)
    ]

    # Count non-None SMILES before and after invalid-SMILES replacement.
    # (Shouldn't be any Nones before replacement, but just to be safe.)
    num_tot_smiles = sum(1 for smile in smiles if smile is not None)
    smiles = replace_invalid_smiles(smiles)
    num_valid_smiles = sum(1 for smile in smiles if smile is not None)
    print(
        f'Valid smiles = {100 * num_valid_smiles / num_tot_smiles}% ({num_valid_smiles}/{num_tot_smiles})'
    )

    # Optionally return the number of decodings for each source molecule
    if return_num_decode:
        return smiles, num_decode

    return smiles
def predict():
    """Renders the predict page and makes predictions if the method is POST.

    SMILES come from (in priority order) the text box, the drawing widget,
    or an uploaded CSV file. Predictions are produced by building a CLI-style
    argument list for PredictArgs and calling make_predictions.
    """
    if request.method == 'GET':
        return render_predict()

    # Get arguments
    ckpt_id = request.form['checkpointName']

    if request.form['textSmiles'] != '':
        # SMILES typed directly into the text box (whitespace-separated).
        smiles = request.form['textSmiles'].split()
    elif request.form['drawSmiles'] != '':
        # A single SMILES produced by the drawing widget.
        smiles = [request.form['drawSmiles']]
    else:
        # Upload data file with SMILES
        data = request.files['data']
        data_name = secure_filename(data.filename)
        data_path = os.path.join(app.config['TEMP_FOLDER'], data_name)
        data.save(data_path)

        # Check if header is smiles (i.e. the uploaded file has no header row).
        possible_smiles = get_header(data_path)[0]
        smiles = [possible_smiles] if Chem.MolFromSmiles(possible_smiles) is not None else []

        # Get remaining smiles
        smiles.extend(get_smiles(data_path))

    # Resolve every checkpoint file belonging to this model id (ensemble prediction).
    models = db.get_models(ckpt_id)
    model_paths = [
        os.path.join(app.config['CHECKPOINT_FOLDER'], f'{model["id"]}.pt')
        for model in models
    ]

    # Task names are read from the first checkpoint; presumably all ensemble
    # members share the same tasks -- confirm against training code.
    task_names = load_task_names(model_paths[0])
    num_tasks = len(task_names)
    gpu = request.form.get('gpu')
    train_args = load_args(model_paths[0])

    # Build arguments (CLI-style list consumed by PredictArgs below).
    arguments = [
        '--test_path', 'None',
        '--preds_path', os.path.join(app.config['TEMP_FOLDER'], app.config['PREDICTIONS_FILENAME']),
        '--checkpoint_paths', *model_paths
    ]

    # 'None' is the form's sentinel for CPU-only prediction.
    if gpu is not None:
        if gpu == 'None':
            arguments.append('--no_cuda')
        else:
            arguments += ['--gpu', gpu]

    # Handle additional features: mirror the feature setup used at training time.
    if train_args.features_path is not None:
        # TODO: make it possible to specify the features generator if trained using features_path
        arguments += ['--features_generator', 'rdkit_2d_normalized', '--no_features_scaling']
    elif train_args.features_generator is not None:
        arguments += ['--features_generator', *train_args.features_generator]

        if not train_args.features_scaling:
            arguments.append('--no_features_scaling')

    # Parse arguments
    args = PredictArgs().parse_args(arguments)

    # Run predictions
    preds = make_predictions(args=args, smiles=smiles)

    if all(p is None for p in preds):
        return render_predict(errors=['All SMILES are invalid'])

    # Replace invalid smiles with message
    invalid_smiles_warning = 'Invalid SMILES String'
    preds = [
        pred if pred is not None else [invalid_smiles_warning] * num_tasks
        for pred in preds
    ]

    # Only the first 10 molecules are shown by default; the rest sit behind "show more".
    return render_predict(
        predicted=True,
        smiles=smiles,
        num_smiles=min(10, len(smiles)),
        show_more=max(0, len(smiles) - 10),
        task_names=task_names,
        num_tasks=len(task_names),
        preds=preds,
        warnings=["List contains invalid SMILES strings"] if None in preds else None,
        errors=["No SMILES strings given"] if len(preds) == 0 else None)
def predict():
    """Renders the predict page and makes predictions if the method is POST.

    SMILES come from (in priority order) the text box, the drawing widget,
    or an uploaded CSV file. Predictions are produced by loading the model's
    training args, mutating them for prediction, and calling make_predictions.
    """
    if request.method == 'GET':
        return render_predict()

    # Get arguments
    ckpt_id = request.form['checkpointName']

    if request.form['textSmiles'] != '':
        # SMILES typed directly into the text box (whitespace-separated).
        smiles = request.form['textSmiles'].split()
    elif request.form['drawSmiles'] != '':
        # A single SMILES produced by the drawing widget.
        smiles = [request.form['drawSmiles']]
    else:
        # Upload data file with SMILES
        data = request.files['data']
        data_name = secure_filename(data.filename)
        data_path = os.path.join(app.config['TEMP_FOLDER'], data_name)
        data.save(data_path)

        # Check if header is smiles (i.e. the uploaded file has no header row).
        possible_smiles = get_header(data_path)[0]
        smiles = [possible_smiles] if Chem.MolFromSmiles(possible_smiles) is not None else []

        # Get remaining smiles
        smiles.extend(get_smiles(data_path))

    # Resolve every checkpoint file belonging to this model id (ensemble prediction).
    models = db.get_models(ckpt_id)
    model_paths = [os.path.join(app.config['CHECKPOINT_FOLDER'], f'{model["id"]}.pt')
                   for model in models]

    task_names = load_task_names(model_paths[0])
    num_tasks = len(task_names)
    gpu = request.form.get('gpu')

    # Create and modify args
    args = load_args(model_paths[0])

    # If the model was trained with precomputed features, regenerate equivalent
    # features at predict time instead (bug fix: was `!= None`).
    if args.features_path is not None:
        args.features_generator = ["rdkit_2d_normalized"]
        args.features_path = None

    preds_path = os.path.join(app.config['TEMP_FOLDER'], app.config['PREDICTIONS_FILENAME'])
    args.test_path = 'None'  # TODO: Remove this hack to avoid assert crashing in modify_predict_args
    args.preds_path = preds_path
    args.checkpoint_paths = model_paths

    # 'None' is the form's sentinel for CPU-only prediction.
    if gpu is not None:
        if gpu == 'None':
            args.no_cuda = True
        else:
            args.gpu = int(gpu)

    modify_predict_args(args)

    # Run predictions
    preds = make_predictions(args, smiles=smiles)

    if all(p is None for p in preds):
        return render_predict(errors=['All SMILES are invalid'])

    # Replace invalid smiles with message
    invalid_smiles_warning = "Invalid SMILES String"
    preds = [pred if pred is not None else [invalid_smiles_warning] * num_tasks
             for pred in preds]

    # Only the first 10 molecules are shown by default; the rest sit behind "show more".
    return render_predict(predicted=True,
                          smiles=smiles,
                          num_smiles=min(10, len(smiles)),
                          show_more=max(0, len(smiles) - 10),
                          task_names=task_names,
                          num_tasks=len(task_names),
                          preds=preds,
                          warnings=["List contains invalid SMILES strings"] if None in preds else None,
                          errors=["No SMILES strings given"] if len(preds) == 0 else None)
return rationales if __name__ == "__main__": args = Args().parse_args() chemprop_model = ChempropModel(checkpoint_dir=args.checkpoint_dir, device=args.device) def scoring_function(smiles: List[str]) -> List[float]: return chemprop_model(smiles)[:, args.property_id - 1] C_PUCT = args.c_puct MIN_ATOMS = args.min_atoms all_smiles = get_smiles(path=args.data_path) header = get_header(path=args.data_path) property_name = header[ args.property_id] if len(header) > args.property_id else 'score' print(f'smiles,{property_name},rationale,rationale_score') for smiles in all_smiles: score = scoring_function([smiles])[0] if score > args.prop_delta: rationales = mcts(smiles=smiles, scoring_function=scoring_function, n_rollout=args.rollout, max_atoms=args.max_atoms, prop_delta=args.prop_delta) else:
def compare_datasets_tsne(args: Args):
    """Plots a joint t-SNE embedding of several SMILES datasets.

    Each dataset is loaded from a CSV, optionally subsampled, embedded together
    with the others via t-SNE over Morgan fingerprints (Jaccard metric), and
    drawn in its own color, either as scatter points or as molecule images.

    :param args: Arguments containing dataset paths, plot colors/sizes,
                 subsampling limit, scale, and the output path.
    :raises ValueError: If there are fewer colors or sizes than datasets.
    """
    # The error message promises both checks; also verify sizes so the later
    # zip over (slices, colors, labels, sizes) cannot silently drop datasets.
    if len(args.smiles_paths) > len(args.colors) or len(args.smiles_paths) > len(args.sizes):
        raise ValueError('Must have at least as many colors and sizes as datasets')

    # Random seed for random subsampling
    np.random.seed(0)

    # Generate labels based on file name
    labels = [os.path.basename(path).replace('.csv', '') for path in args.smiles_paths]

    # Load the smiles datasets
    print('Loading data')
    smiles, slices = [], []
    for smiles_path, color, label in zip(args.smiles_paths, args.colors, labels):
        # Get SMILES
        new_smiles = get_smiles(path=smiles_path, smiles_column=args.smiles_column)
        print(f'{label}: {len(new_smiles):,}')

        # Subsample if dataset is too large
        if len(new_smiles) > args.max_per_dataset:
            print(f'Subsampling to {args.max_per_dataset:,} molecules')
            new_smiles = np.random.choice(new_smiles, size=args.max_per_dataset, replace=False).tolist()

        # Remember which span of the combined list belongs to this dataset.
        slices.append(slice(len(smiles), len(smiles) + len(new_smiles)))
        smiles += new_smiles

    # Compute Morgan fingerprints
    print('Computing Morgan fingerprints')
    morgan_generator = get_features_generator('morgan')
    morgans = [morgan_generator(smile) for smile in tqdm(smiles, total=len(smiles))]

    print('Running t-SNE')
    start = time.time()
    tsne = TSNE(n_components=2, init='pca', random_state=0, metric='jaccard')
    X = tsne.fit_transform(morgans)
    print(f'time = {time.time() - start:.2f} seconds')

    print('Plotting t-SNE')
    # Rescale the embedding to the unit square for plotting.
    x_min, x_max = np.min(X, axis=0), np.max(X, axis=0)
    X = (X - x_min) / (x_max - x_min)

    makedirs(args.save_path, isfile=True)

    plt.clf()
    fontsize = 50 * args.scale
    fig = plt.figure(figsize=(64 * args.scale, 48 * args.scale))
    plt.title('t-SNE using Morgan fingerprint with Jaccard similarity', fontsize=2 * fontsize)
    ax = fig.gca()
    handles = []
    legend_kwargs = dict(loc='upper right', fontsize=fontsize)

    for slc, color, label, size in zip(slices, args.colors, labels, args.sizes):
        if args.plot_molecules:
            # Plots molecules
            handles.append(mpatches.Patch(color=color, label=label))

            for smile, (x, y) in zip(smiles[slc], X[slc]):
                img = Draw.MolsToGridImage([Chem.MolFromSmiles(smile)], molsPerRow=1, subImgSize=(200, 200))
                imagebox = offsetbox.AnnotationBbox(offsetbox.OffsetImage(img), (x, y), bboxprops=dict(color=color))
                ax.add_artist(imagebox)
        else:
            # Plots points
            plt.scatter(X[slc, 0], X[slc, 1], s=150 * size, color=color, label=label)

    if args.plot_molecules:
        legend_kwargs['handles'] = handles

    plt.legend(**legend_kwargs)
    plt.xticks([]), plt.yticks([])

    print('Saving t-SNE')
    plt.savefig(args.save_path)
return rationales if __name__ == "__main__": args = InterpretArgs().parse_args() chemprop_model = ChempropModel(args) def scoring_function(smiles: List[str]) -> List[float]: return chemprop_model(smiles)[:, args.property_id - 1] C_PUCT = args.c_puct MIN_ATOMS = args.min_atoms all_smiles = get_smiles(path=args.data_path, smiles_column=args.smiles_column) header = get_header(path=args.data_path) property_name = header[args.property_id] if len(header) > args.property_id else 'score' print(f'smiles,{property_name},rationale,rationale_score') for smiles in all_smiles: score = scoring_function([smiles])[0] if score > args.prop_delta: rationales = mcts( smiles=smiles, scoring_function=scoring_function, n_rollout=args.rollout, max_atoms=args.max_atoms, prop_delta=args.prop_delta )
def predict():
    """Renders the predict page and makes predictions if the method is POST.

    SMILES come from an uploaded CSV file, the text box, or the drawing
    widget. Prediction arguments are built from the CLI parser defaults and
    then patched in place before calling make_predictions.
    """
    if request.method == 'GET':
        return render_predict()

    # Get arguments
    checkpoint_name = request.form['checkpointName']

    if 'data' in request.files:
        # Upload data file with SMILES
        data = request.files['data']
        data_name = secure_filename(data.filename)
        data_path = os.path.join(app.config['TEMP_FOLDER'], data_name)
        data.save(data_path)

        # Check if header is smiles (i.e. the uploaded file has no header row).
        possible_smiles = get_header(data_path)[0]
        smiles = [possible_smiles] if Chem.MolFromSmiles(possible_smiles) is not None else []

        # Get remaining smiles
        smiles.extend(get_smiles(data_path))
    elif request.form['textSmiles'] != '':
        # SMILES typed directly into the text box (whitespace-separated).
        smiles = request.form['textSmiles'].split()
    else:
        # A single SMILES produced by the drawing widget.
        smiles = [request.form['drawSmiles']]

    checkpoint_path = os.path.join(app.config['CHECKPOINT_FOLDER'], checkpoint_name)

    task_names = load_task_names(checkpoint_path)
    num_tasks = len(task_names)
    gpu = request.form.get('gpu')

    # Create and modify args
    parser = ArgumentParser()
    add_predict_args(parser)
    # NOTE(review): parse_args() with no argument list reads sys.argv, so this
    # relies on the server process being launched without conflicting CLI
    # arguments -- confirm, or consider parse_args([]) to use pure defaults.
    args = parser.parse_args()

    preds_path = os.path.join(app.config['TEMP_FOLDER'], app.config['PREDICTIONS_FILENAME'])
    args.test_path = 'None'  # TODO: Remove this hack to avoid assert crashing in modify_predict_args
    args.preds_path = preds_path
    args.checkpoint_path = checkpoint_path
    args.write_smiles = True

    # 'None' is the form's sentinel for CPU-only prediction.
    if gpu is not None:
        if gpu == 'None':
            args.no_cuda = True
        else:
            args.gpu = int(gpu)

    modify_predict_args(args)

    # Run predictions
    preds = make_predictions(args, smiles=smiles)

    if all(p is None for p in preds):
        return render_predict(errors=['All SMILES are invalid'])

    # Replace invalid smiles with message
    invalid_smiles_warning = "Invalid SMILES String"
    preds = [
        pred if pred is not None else [invalid_smiles_warning] * num_tasks
        for pred in preds
    ]

    # Only the first 10 molecules are shown by default; the rest sit behind "show more".
    return render_predict(
        predicted=True,
        smiles=smiles,
        num_smiles=min(10, len(smiles)),
        show_more=max(0, len(smiles) - 10),
        task_names=task_names,
        num_tasks=len(task_names),
        preds=preds,
        warnings=["List contains invalid SMILES strings"] if None in preds else None,
        errors=["No SMILES strings given"] if len(preds) == 0 else None)
def generate_and_save_features(args: Args):
    """
    Computes and saves features for a dataset of molecules as a 2D array in a .npz file.

    Features are checkpointed into a temporary directory every
    args.save_frequency molecules so an interrupted run can be resumed
    (unless args.restart is set).

    :param args: Arguments.
    :raises ValueError: If args.save_path already exists and args.restart is False.
    """
    # Create directory for save_path
    makedirs(args.save_path, isfile=True)

    # Get data and features function
    smiles = get_smiles(path=args.data_path, smiles_column=args.smiles_column)
    features_generator = get_features_generator(args.features_generator)
    temp_save_dir = args.save_path + '_temp'

    # Load partially complete data
    if args.restart:
        # Discard any previous results and start from scratch.
        if os.path.exists(args.save_path):
            os.remove(args.save_path)
        if os.path.exists(temp_save_dir):
            shutil.rmtree(temp_save_dir)
    else:
        if os.path.exists(args.save_path):
            raise ValueError(f'"{args.save_path}" already exists and args.restart is False.')

        # Resume from previously checkpointed features, if any.
        if os.path.exists(temp_save_dir):
            features, temp_num = load_temp(temp_save_dir)

    if not os.path.exists(temp_save_dir):
        makedirs(temp_save_dir)
        features, temp_num = [], 0

    # Build features map function,
    # restricted to data for which features have not been computed yet.
    smiles = smiles[len(features):]

    pool = None
    if args.sequential:
        features_map = map(features_generator, smiles)
    else:
        pool = Pool()
        features_map = pool.imap(features_generator, smiles)

    try:
        # Get features
        temp_features = []
        for i, feats in tqdm(enumerate(features_map), total=len(smiles)):
            temp_features.append(feats)

            # Save temporary features every save_frequency
            if (i > 0 and (i + 1) % args.save_frequency == 0) or i == len(smiles) - 1:
                save_features(os.path.join(temp_save_dir, f'{temp_num}.npz'), temp_features)
                features.extend(temp_features)
                temp_features = []
                temp_num += 1
    finally:
        # Shut down the worker processes (the original code leaked the Pool).
        if pool is not None:
            pool.close()
            pool.join()

    try:
        # Save all features
        save_features(args.save_path, features)

        # Remove temporary features
        shutil.rmtree(temp_save_dir)
    except OverflowError:
        print('Features array is too large to save as a single file. Instead keeping features as a directory of files.')
def evaluate(pred_smiles_dir: str,
             train_path: str,
             val_path: str,
             checkpoint_dir: Optional[str],
             computed_prop: Optional[str],
             prop_min: float,
             sim_thresholds: List[float],
             chemprop_predictor: ChempropPredictor = None,
             prop_max: float = None,
             unconditional: bool = False):
    """Evaluates generated molecules against property and similarity criteria.

    Loads the predicted SMILES (num_decode decodings per source molecule),
    filters them by predicted property value (prop_min/prop_max) and then, for
    each similarity threshold, by Tanimoto similarity to the source molecule.
    Prints success rate, diversity, novelty, and property statistics for each
    threshold. All results are printed; nothing is returned.

    :param pred_smiles_dir: Directory of .txt files with predicted SMILES.
    :param train_path: Path to the training SMILES (used for novelty).
    :param val_path: Path to the source (validation) SMILES.
    :param checkpoint_dir: Checkpoint dir for property prediction
                           (used when chemprop_predictor is None).
    :param computed_prop: Name of a computed property
                          (used when chemprop_predictor is None).
    :param prop_min: Minimum allowed predicted property value.
    :param sim_thresholds: Tanimoto similarity thresholds to evaluate at.
    :param chemprop_predictor: Optional predictor; overrides checkpoint_dir/computed_prop.
    :param prop_max: Optional maximum allowed predicted property value.
    :param unconditional: If True, delegates entirely to evaluate_unconditional.
    """
    if unconditional:
        evaluate_unconditional(pred_smiles_dir, train_path, val_path, checkpoint_dir,
                               computed_prop, prop_min, sim_thresholds,
                               chemprop_predictor, prop_max)
        return

    # Get smiles
    pred_smiles, num_decode = get_pred_smiles(pred_smiles_dir, return_num_decode=True)
    train_smiles = get_train_smiles(train_path)
    source_smiles = get_smiles(val_path, header=False)
    # Each source molecule must have exactly num_decode predictions.
    assert len(source_smiles) * num_decode == len(pred_smiles)

    # Get unique pred smiles (None marks invalid/filtered-out predictions)
    unique_pred_smiles = set(pred_smiles) - {None}
    unique_pred_smiles = list(unique_pred_smiles)
    num_unique_pred_smiles = len(unique_pred_smiles)

    print('Computing tanimoto similarities from pred molecule to source molecule')
    tanimoto_similarities = {}
    for i, pred_smile in tqdm(enumerate(pred_smiles), total=len(pred_smiles)):
        # i // num_decode maps a prediction index back to its source molecule.
        source_smile = source_smiles[i // num_decode]
        tanimoto_similarities[(source_smile, pred_smile)] = tanimoto_similarity(source_smile, pred_smile) if pred_smile is not None else None

    print('Computing property values')
    if chemprop_predictor is not None:
        property_predictions = chemprop_predictor(unique_pred_smiles)
    else:
        property_predictions = predict_properties(unique_pred_smiles, checkpoint_dir, computed_prop)
    property_values = {pred_smile: property_prediction
                       for pred_smile, property_prediction in zip(unique_pred_smiles, property_predictions)}

    print('Filtering by property values')
    # Replace predictions outside [prop_min, prop_max] with None (keeps positions
    # so the per-source grouping by num_decode stays intact).
    pred_smiles = [pred_smile if pred_smile is not None and property_values[pred_smile] is not None and property_values[pred_smile] >= prop_min \
                   and (prop_max is None or property_values[pred_smile] <= prop_max) else None for pred_smile in pred_smiles]

    # Get unique pred smiles
    filtered_num_unique_pred_smiles = len(set(pred_smiles) - {None})

    if filtered_num_unique_pred_smiles == 0:
        print(f'Valid molecules above prop threshold = 0% (0/{num_unique_pred_smiles})')
        print('Cannot compute any other metrics with 0 molecules')
        return

    print(f'Valid molecules above prop threshold = '
          f'{100 * filtered_num_unique_pred_smiles / num_unique_pred_smiles:.2f}% '
          f'({filtered_num_unique_pred_smiles}/{num_unique_pred_smiles})')
    num_unique_pred_smiles = filtered_num_unique_pred_smiles

    for sim_threshold in sim_thresholds:
        print(f'Minimum tanimoto similarity to source molecule allowed = {sim_threshold}')

        # Filtering by tanimoto similarity (again position-preserving via None).
        filtered_pred_smiles = []
        for i, pred_smile in enumerate(pred_smiles):
            source_smile = source_smiles[i // num_decode]
            filtered_pred_smiles.append(pred_smile if pred_smile is not None \
                and tanimoto_similarities[(source_smile, pred_smile)] is not None \
                and tanimoto_similarities[(source_smile, pred_smile)] >= sim_threshold else None)

        num_unique_filtered_pred_smiles = len(set(filtered_pred_smiles) - {None})
        print(f'Percent of unique pred smiles after filtering by tanimoto = '
              f'{100 * num_unique_filtered_pred_smiles / num_unique_pred_smiles:.2f}% '
              f'({num_unique_filtered_pred_smiles}/{num_unique_pred_smiles})')

        if num_unique_filtered_pred_smiles == 0:
            print('No molecules remaining, skipping')
            continue

        # Evaluate: a source molecule "succeeds" if at least one of its
        # num_decode decodings survived all filters.
        succeeded_val, tot_val = 0, 0
        for i in range(0, len(filtered_pred_smiles), num_decode):
            decoded_smiles = filtered_pred_smiles[i:i + num_decode]
            if any(s is not None for s in decoded_smiles):
                succeeded_val += 1
            tot_val += 1
        paper_success = succeeded_val / tot_val

        diversity_mean, diversity_std = diversity_score(filtered_pred_smiles, num_decode)
        novelty = novelty_score(filtered_pred_smiles, train_smiles)
        filtered_properties = [property_values[pred_smile]
                               for pred_smile in filtered_pred_smiles if pred_smile is not None]
        property_mean, property_std = np.mean(filtered_properties), np.std(filtered_properties)

        print(f'Success = {paper_success}')
        print(f'Diversity = {diversity_mean} +/- {diversity_std}')
        print(f'Novelty = {novelty}')
        print(f'Property = {property_mean} +/- {property_std}')