def load_gcnn_model():
    # global rlm_gcnn_scaler, rlm_gcnn_features_scaler
    # gcnn_scaler, gcnn_features_scaler = load_scalers('./models/gcnn_model.pt')
    print('Loading RLM graph convolutional neural network model', file=sys.stdout)
    rlm_gcnn_scaler_path = f'{rlm_base_models_path}/gcnn_model.pt'
    if path.exists(rlm_gcnn_scaler_path):
        rlm_gcnn_scaler, _ = load_scalers(rlm_gcnn_scaler_path)
    else:
        # Download the checkpoint, showing progress, then write it to disk
        rlm_gcnn_scaler_url = f'{base_url}/gcnn_model.pt'
        rlm_gcnn_scaler_request = requests.get(rlm_gcnn_scaler_url)
        with tqdm.wrapattr(open(os.devnull, 'wb'), 'write', miniters=1,
                           desc=rlm_gcnn_scaler_url.split('/')[-1],
                           total=int(rlm_gcnn_scaler_request.headers.get('content-length', 0))) as fout:
            for chunk in rlm_gcnn_scaler_request.iter_content(chunk_size=4096):
                fout.write(chunk)
        with open(rlm_gcnn_scaler_path, 'wb') as rlm_gcnn_scaler_file:
            rlm_gcnn_scaler_file.write(rlm_gcnn_scaler_request.content)
        rlm_gcnn_scaler, _ = load_scalers(rlm_gcnn_scaler_path)
    rlm_gcnn_model = load_checkpoint(rlm_gcnn_scaler_path)
    return rlm_gcnn_scaler, rlm_gcnn_model
def load_model(args: PredictArgs, generator: bool = False):
    """
    Function to load a model or ensemble of models from file.

    If generator is True, a generator of the respective model and scaler objects is
    returned (memory efficient), else the full list (holding all models in memory,
    necessary for preloading).

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param generator: A boolean to return a generator instead of a list of models and scalers.
    :return: A tuple of updated prediction arguments, training arguments, a list or generator
             object of models, a list or generator object of scalers, the number of tasks and
             their respective names.
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    update_prediction_args(predict_args=args, train_args=train_args)
    args: Union[PredictArgs, TrainArgs]

    # Load model and scalers
    models = (load_checkpoint(checkpoint_path, device=args.device)
              for checkpoint_path in args.checkpoint_paths)
    scalers = (load_scalers(checkpoint_path)
               for checkpoint_path in args.checkpoint_paths)
    if not generator:
        models = list(models)
        scalers = list(scalers)

    return args, train_args, models, scalers, num_tasks, task_names
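# Usage sketch (an assumption for illustration, not part of the snippet above): loading an
# ensemble lazily through chemprop's Tap-based PredictArgs. File and directory names are placeholders.
from chemprop.args import PredictArgs

predict_args = PredictArgs().parse_args([
    '--test_path', 'test.csv',
    '--preds_path', 'preds.csv',
    '--checkpoint_dir', 'ckpts',
])
predict_args, train_args, models, scalers, num_tasks, task_names = load_model(predict_args, generator=True)
for model, scaler_tuple in zip(models, scalers):
    pass  # with generator=True, only one model is materialized in memory at a time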
def predict_smile(checkpoint_path: str, smile: str):
    """
    Makes a prediction for a single SMILES string using a single checkpoint.

    :param checkpoint_path: Path to a trained model checkpoint (.pt).
    :param smile: SMILES string to make a prediction on.
    :return: The prediction for the first task.
    """
    smiles = [smile]
    args = Namespace()

    # print('Loading training args')
    scaler, features_scaler = load_scalers(checkpoint_path)
    train_args = load_args(checkpoint_path)

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    # print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)
    else:
        print("Enter Valid Smile String")
        return

    # print('Validating SMILES')
    valid_indices = [i for i in range(len(test_data)) if test_data[i].mol is not None]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros((len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))

    model = load_checkpoint(checkpoint_path, cuda=args.cuda)
    model_preds = predict(model=model, data=test_data, batch_size=1, scaler=scaler)
    sum_preds += np.array(model_preds)

    # Ensemble predictions
    return sum_preds[0][0]
def __init__(self, checkpoint_dir):
    self.checkpoints = []
    for root, _, files in os.walk(checkpoint_dir):
        for fname in files:
            if fname.endswith('.pt'):
                fname = os.path.join(root, fname)
                self.scaler, self.features_scaler = load_scalers(fname)
                self.train_args = load_args(fname)
                model = load_checkpoint(fname, cuda=True)
                self.checkpoints.append(model)
def load_gcnn_model(model_file_path, model_file_url):
    if path.exists(model_file_path):
        gcnn_scaler, _ = load_scalers(model_file_path)
    else:
        # Download the checkpoint, showing progress, then write it to disk
        gcnn_scaler_request = requests.get(model_file_url)
        with tqdm.wrapattr(open(os.devnull, "wb"), "write", miniters=1,
                           desc=model_file_url.split('/')[-1],
                           total=int(gcnn_scaler_request.headers.get('content-length', 0))) as fout:
            for chunk in gcnn_scaler_request.iter_content(chunk_size=4096):
                fout.write(chunk)
        with open(model_file_path, 'wb') as gcnn_scaler_file:
            gcnn_scaler_file.write(gcnn_scaler_request.content)
        gcnn_scaler, _ = load_scalers(model_file_path)
    gcnn_model = load_checkpoint(model_file_path)
    return gcnn_scaler, gcnn_model
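# Usage sketch (assumed names, for illustration only): the loader above fetches the checkpoint
# on first use and reads it from disk afterwards. The URL below is a placeholder, not a real endpoint.
gcnn_scaler, gcnn_model = load_gcnn_model(
    model_file_path='./models/gcnn_model.pt',
    model_file_url='https://example.org/models/gcnn_model.pt',
)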
def __init__(self, checkpoint_dir):
    self.features_generator = ['rdkit_2d_normalized']
    self.checkpoints, self.scalers, self.features_scalers = [], [], []
    for root, _, files in os.walk(checkpoint_dir):
        for fname in files:
            if fname.endswith('.pt'):
                fname = os.path.join(root, fname)
                scaler, features_scaler = load_scalers(fname)
                self.scalers.append(scaler)
                self.features_scalers.append(features_scaler)
                model = load_checkpoint(fname)
                self.checkpoints.append(model)
def __init__(self, args: InterpretArgs) -> None:
    self.args = args
    self.train_args = load_args(args.checkpoint_paths[0])

    # If features were used during training, they must be used when predicting
    if ((self.train_args.features_path is not None or self.train_args.features_generator is not None)
            and args.features_generator is None):
        raise ValueError('Features were used during training so they must be specified again during prediction '
                         'using the same type of features as before (with --features_generator <generator> '
                         'and using --no_features_scaling if applicable).')

    self.scaler, self.features_scaler = load_scalers(args.checkpoint_paths[0])
    self.checkpoints = [load_checkpoint(checkpoint_path, device=args.device)
                        for checkpoint_path in args.checkpoint_paths]
def __init__(self, checkpoint_paths: List[str], device: torch.device) -> None:
    self.train_args = load_args(checkpoint_paths[0])

    if self.train_args.features_path is not None and self.train_args.features_generator is None:
        raise ValueError('Must specify features generator using --features_generator <generator> '
                         'when using a model trained with additional features.')

    self.scaler, self.features_scaler = load_scalers(checkpoint_paths[0])
    self.checkpoints = [load_checkpoint(checkpoint_path, device=device)
                        for checkpoint_path in checkpoint_paths]
def __init__(self, checkpoint_dir, features_generator=None):
    self.features_generator = features_generator
    self.checkpoints, self.scalers, self.features_scalers = [], [], []
    for root, _, files in os.walk(checkpoint_dir):
        for fname in files:
            if fname.endswith('.pt'):
                fname = os.path.join(root, fname)
                scaler, features_scaler, _, _ = load_scalers(fname)
                self.scalers.append(scaler)
                self.features_scalers.append(features_scaler)
                model = load_checkpoint(fname, device=torch.device('cpu'))
                self.checkpoints.append(model)
def __init__(self, args: Namespace):
    if args.computed_prop is not None:
        self.computed_prop = True
        if args.computed_prop == 'penalized_logp':
            self.scorer = penalized_logp
        elif args.computed_prop == 'logp':
            self.scorer = logp
        elif args.computed_prop == 'qed':
            self.scorer = qed
        elif args.computed_prop == 'sascore':
            self.scorer = sascore
        elif args.computed_prop == 'drd2':
            self.scorer = drd2
        else:
            raise ValueError
        return

    self.computed_prop = False
    chemprop_paths = []
    for root, _, files in os.walk(args.chemprop_dir):
        for fname in files:
            if fname.endswith('.pt'):
                chemprop_paths.append(os.path.join(root, fname))

    self.scaler, self.features_scaler = load_scalers(chemprop_paths[0])
    self.train_args = load_args(chemprop_paths[0])
    if self.train_args.features_path is not None:
        self.train_args.features_path = None
        self.train_args.features_generator = ['rdkit_2d_normalized']  # just assume this

    self.num_tasks = self.train_args.num_tasks
    self.batch_size = args.batch_size * 4
    self.features_generator = get_features_generator(args.features_generator[0]) \
        if args.features_generator is not None else None
    self.neg_threshold = args.neg_threshold
    self.prop_index = args.prop_index

    self.chemprop_models = []
    for checkpoint_path in chemprop_paths:
        self.chemprop_models.append(load_checkpoint(checkpoint_path, cuda=True))
def __init__(self, args: InterpretArgs) -> None:
    """
    :param args: A :class:`~chemprop.args.InterpretArgs` object containing arguments for interpretation.
    """
    self.args = args
    self.train_args = load_args(args.checkpoint_paths[0])

    # If features were used during training, they must be used when predicting
    if ((self.train_args.features_path is not None or self.train_args.features_generator is not None)
            and args.features_generator is None):
        raise ValueError('Features were used during training so they must be specified again during prediction '
                         'using the same type of features as before (with --features_generator <generator> '
                         'and using --no_features_scaling if applicable).')

    if self.train_args.atom_descriptors_size > 0 or self.train_args.atom_features_size > 0 \
            or self.train_args.bond_features_size > 0:
        raise NotImplementedError('The interpret function does not yet work with additional atom or bond features')

    self.scaler, self.features_scaler, self.atom_descriptor_scaler, self.bond_feature_scaler = \
        load_scalers(args.checkpoint_paths[0])
    self.checkpoints = [load_checkpoint(checkpoint_path, device=args.device)
                        for checkpoint_path in args.checkpoint_paths]
def make_predictions(args: Namespace, smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles.
    Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False, args=args)
    else:
        test_data = get_data(path=args.test_path, args=args, use_compound_names=args.use_compound_names,
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [i for i in range(len(test_data)) if test_data[i].mol is not None]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    if args.use_compound_names:
        compound_names = test_data.compound_names()

    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros((len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    # for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
    # Load model (ensemble loop disabled; a single checkpoint is used)
    model = load_checkpoint(args.checkpoint_path, cuda=args.cuda)
    test_preds, test_smiles_batch = predict(model=model, data=test_data, batch_size=args.batch_size, scaler=scaler)
    return test_preds, test_smiles_batch
def visualize_encoding_property_space(args: Namespace):
    # Load data
    data = get_data(args.data_path)

    # Sort according to similarity measure
    if args.similarity_measure == 'property':
        data.sort(key=lambda d: d.targets[args.task_index])
    elif args.similarity_measure == 'random':
        data.shuffle(args.seed)
    else:
        raise ValueError('similarity_measure "{}" not supported or not implemented yet.'.format(args.similarity_measure))

    # Load model and scalers
    model = load_checkpoint(args.checkpoint_path)
    scaler, features_scaler = load_scalers(args.checkpoint_path)
    data.normalize_features(features_scaler)

    # Random seed
    if args.seed is not None:
        random.seed(args.seed)

    # Generate visualizations
    for i in trange(args.num_examples):
        # Get random three molecules with similar properties
        index = random.randint(1, len(data) - 2)
        molecules = MoleculeDataset(data[index - 1:index + 2])
        molecule_targets = [t[args.task_index] for t in molecules.targets()]

        # Encode three molecules
        molecule_encodings = model.encoder(molecules.smiles())

        # Define interpolation
        def predict_property(point: List[int]) -> float:
            # Return true value on endpoints of triangle
            argmax = np.argmax(point)
            if point[argmax] == 1:
                return molecule_targets[argmax]

            # Interpolate and predict task value
            encoding = sum(point[j] * molecule_encodings[j] for j in range(len(molecule_encodings)))
            pred = model.ffn(encoding).data.cpu().numpy()
            pred = scaler.inverse_transform(pred)
            pred = pred.item()
            return pred

        # Create visualization
        scale = 20
        fontsize = 6
        figure, tax = ternary.figure(scale=scale)
        tax.heatmapf(predict_property, boundary=True, style="hexagonal")
        tax.set_title("Property Prediction")
        tax.right_axis_label('{} ({:.6f}) -->'.format(molecules[0].smiles, molecules[0].targets[args.task_index]),
                             fontsize=fontsize)
        tax.left_axis_label('{} ({:.6f}) -->'.format(molecules[1].smiles, molecules[1].targets[args.task_index]),
                            fontsize=fontsize)
        tax.bottom_axis_label('<-- {} ({:.6f})'.format(molecules[2].smiles, molecules[2].targets[args.task_index]),
                              fontsize=fontsize)
        tax.savefig(os.path.join(args.save_dir, '{}.png'.format(i)))
def make_predictions(args: Namespace,
                     smiles: List[str] = None,
                     invalid_smiles_warning: str = None) -> List[List[float]]:
    """Makes predictions."""
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    if invalid_smiles_warning is not None:
        success_indices = []
        for i, s in enumerate(smiles):
            mol = Chem.MolFromSmiles(s)
            if mol is not None:
                success_indices.append(i)
        full_smiles = smiles
        smiles = [smiles[i] for i in success_indices]

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles)
    else:
        test_data = get_data(args.test_path, args, use_compound_names=args.compound_names)
    test_smiles = test_data.smiles()
    if args.compound_names:
        compound_names = test_data.compound_names()
    print('Test size = {:,}'.format(len(test_data)))

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    sum_preds = np.zeros((len(test_data), args.num_tasks))
    print('Predicting with an ensemble of {} models'.format(len(args.checkpoint_paths)))
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds = predict(model=model, data=test_data, args=args, scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / args.ensemble_size
    avg_preds = avg_preds.tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    print('Saving predictions to {}'.format(args.preds_path))
    with open(args.preds_path, 'w') as f:
        if args.write_smiles:
            f.write('smiles,')
        if args.compound_names:
            f.write('compound_name,')
        f.write(','.join(args.task_names) + '\n')

        for i in range(len(avg_preds)):
            if args.write_smiles:
                f.write(test_smiles[i] + ',')
            if args.compound_names:
                f.write(compound_names[i] + ',')
            f.write(','.join(str(p) for p in avg_preds[i]) + '\n')

    if invalid_smiles_warning is not None:
        full_preds = [[invalid_smiles_warning] for _ in range(len(full_smiles))]
        for i, si in enumerate(success_indices):
            full_preds[si] = avg_preds[i]
        return full_preds

    return avg_preds
def make_predictions(args: PredictArgs, smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles.
    Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    if ((train_args.features_path is not None or train_args.features_generator is not None)
            and args.features_path is None and args.features_generator is None):
        raise ValueError('Features were used during training so they must be specified again during prediction '
                         'using the same type of features as before (with either --features_generator or '
                         '--features_path and using --no_features_scaling if applicable).')

    # Update predict args with training arguments to create a merged args object
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)
    args: Union[PredictArgs, TrainArgs]

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False,
                                         features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path, args=args, target_columns=[], skip_invalid_smiles=False)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if full_data[full_index].mol is not None:
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1
    test_data = MoleculeDataset([full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Initialize uncertainty estimator
    if args.uncertainty:
        uncertainty_estimator = uncertainty_estimator_builder(args.uncertainty)(args, test_data, scaler)

    # Predict with each model individually and sum predictions
    if not args.uncertainty:
        if args.dataset_type == 'multiclass':
            sum_preds = np.zeros((len(test_data), num_tasks, args.multiclass_num_classes))
        else:
            sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for N, checkpoint_path in tqdm(enumerate(args.checkpoint_paths), total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, device=args.device)
        model.training = False
        if not args.uncertainty:
            model_preds = predict(model=model, data_loader=test_data_loader, scaler=scaler)
            sum_preds += np.array(model_preds)
        else:
            uncertainty_estimator.process_model(model, N)

    # Ensemble predictions
    if not args.uncertainty:
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()
    else:
        avg_preds, avg_UQ = uncertainty_estimator.calculate_UQ()
        if type(avg_UQ) is tuple:
            aleatoric, epistemic = avg_UQ

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == 'multiclass':
        task_names = [f'{name}_class_{i}' for name in task_names
                      for i in range(args.multiclass_num_classes)]

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = avg_preds[valid_index] if valid_index is not None else ['Invalid SMILES'] * len(task_names)
        if args.uncertainty:
            if not args.split_UQ:
                cur_UQ = avg_UQ[valid_index] if valid_index is not None else ['Invalid SMILES'] * len(task_names)
                datapoint.row['Uncertainty'] = cur_UQ
            elif args.split_UQ:
                cur_al = aleatoric[valid_index] if valid_index is not None else ['Invalid SMILES'] * len(task_names)
                cur_ep = epistemic[valid_index] if valid_index is not None else ['Invalid SMILES'] * len(task_names)
                datapoint.row['Aleatoric'] = cur_al
                datapoint.row['Epistemic'] = cur_ep
        if type(preds) is list:
            for pred_name, pred in zip(task_names, preds):
                datapoint.row[pred_name] = pred
        else:
            datapoint.row[task_names[0]] = preds

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()
        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
def make_predictions(args: PredictArgs, smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to make predictions on the data.

    If SMILES are provided, then makes predictions on smiles.
    Otherwise makes predictions on :code:`args.test_data`.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of lists of target predictions.
    """
    print("Loading training args")
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    update_prediction_args(predict_args=args, train_args=train_args)
    args: Union[PredictArgs, TrainArgs]

    if args.atom_descriptors == "feature":
        set_extra_atom_fdim(train_args.atom_features_size)
    if args.bond_features_path is not None:
        set_extra_bond_fdim(train_args.bond_features_size)

    # Set explicit H option and reaction option
    set_explicit_h(train_args.explicit_h)
    set_reaction(train_args.reaction, train_args.reaction_mode)

    print("Loading data")
    if smiles is not None:
        full_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False,
                                         features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path, smiles_columns=args.smiles_columns, target_columns=[],
                             ignore_columns=[], skip_invalid_smiles=False, args=args,
                             store_row=not args.drop_extra_columns)

    print("Validating SMILES")
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1
    test_data = MoleculeDataset([full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f"Test size = {len(test_data):,}")

    # Predict with each model individually and sum predictions
    if args.dataset_type == "multiclass":
        sum_preds = np.zeros((len(test_data), num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=args.batch_size,
                                          num_workers=0 if sys.platform == "darwin" else args.num_workers)

    # Partial results for variance robust calculation.
    if args.ensemble_variance:
        all_preds = np.zeros((len(test_data), num_tasks, len(args.checkpoint_paths)))

    print(f"Predicting with an ensemble of {len(args.checkpoint_paths)} models")
    for index, checkpoint_path in enumerate(tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths))):
        # Load model and scalers
        model = load_checkpoint(checkpoint_path, device=args.device)
        scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler = load_scalers(checkpoint_path)

        # Normalize features
        if args.features_scaling or train_args.atom_descriptor_scaling or train_args.bond_feature_scaling:
            test_data.reset_features_and_targets()
            if args.features_scaling:
                test_data.normalize_features(features_scaler)
            if train_args.atom_descriptor_scaling and args.atom_descriptors is not None:
                test_data.normalize_features(atom_descriptor_scaler, scale_atom_descriptors=True)
            if train_args.bond_feature_scaling and args.bond_features_size > 0:
                test_data.normalize_features(bond_feature_scaler, scale_bond_features=True)

        # Make predictions
        model_preds = predict(model=model, data_loader=test_data_loader, scaler=scaler)
        sum_preds += np.array(model_preds)
        if args.ensemble_variance:
            all_preds[:, :, index] = model_preds

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    if args.ensemble_variance:
        all_epi_uncs = np.var(all_preds, axis=2)
        all_epi_uncs = all_epi_uncs.tolist()

    # Save predictions
    print(f"Saving predictions to {args.preds_path}")
    assert len(test_data) == len(avg_preds)
    if args.ensemble_variance:
        assert len(test_data) == len(all_epi_uncs)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == "multiclass":
        task_names = [f"{name}_class_{i}" for name in task_names
                      for i in range(args.multiclass_num_classes)]

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = (avg_preds[valid_index] if valid_index is not None
                 else ["Invalid SMILES"] * len(task_names))
        if args.ensemble_variance:
            epi_uncs = (all_epi_uncs[valid_index] if valid_index is not None
                        else ["Invalid SMILES"] * len(task_names))

        # If extra columns have been dropped, add back in SMILES columns
        if args.drop_extra_columns:
            datapoint.row = OrderedDict()
            smiles_columns = args.smiles_columns
            for column, smiles in zip(smiles_columns, datapoint.smiles):
                datapoint.row[column] = smiles

        # Add predictions columns
        if args.ensemble_variance:
            for pred_name, pred, epi_unc in zip(task_names, preds, epi_uncs):
                datapoint.row[pred_name] = pred
                datapoint.row[pred_name + "_epi_unc"] = epi_unc
        else:
            for pred_name, pred in zip(task_names, preds):
                datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, "w") as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()
        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
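# Usage sketch (an assumption, not from the snippet above): driving this prediction entry point
# through chemprop's PredictArgs. Paths are placeholders; --ensemble_variance adds the
# per-task "_epi_unc" columns written by the function above.
from chemprop.args import PredictArgs

predict_args = PredictArgs().parse_args([
    '--test_path', 'molecules.csv',
    '--preds_path', 'predictions.csv',
    '--checkpoint_dir', 'ckpts',
    '--ensemble_variance',
])
avg_preds = make_predictions(predict_args)  # or make_predictions(predict_args, smiles=[['CCO']])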
def molecule_fingerprint(args: PredictArgs, smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to encode fingerprint vectors for the data.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of fingerprint vectors (list of floats)
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    update_prediction_args(predict_args=args, train_args=train_args, validate_feature_sources=False)
    args: Union[PredictArgs, TrainArgs]

    # Set explicit H option and reaction option
    set_explicit_h(train_args.explicit_h)
    set_reaction(train_args.reaction, train_args.reaction_mode)

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False,
                                         features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path, smiles_columns=args.smiles_columns, target_columns=[],
                             ignore_columns=[], skip_invalid_smiles=False, args=args, store_row=True)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1
    test_data = MoleculeDataset([full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    # Load model
    print('Encoding smiles into a fingerprint vector from a single model')
    if len(args.checkpoint_paths) != 1:
        raise ValueError("Fingerprint generation only supports one model, cannot use an ensemble")

    model = load_checkpoint(args.checkpoint_paths[0], device=args.device)
    scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler = load_scalers(args.checkpoint_paths[0])

    # Normalize features
    if args.features_scaling or train_args.atom_descriptor_scaling or train_args.bond_feature_scaling:
        test_data.reset_features_and_targets()
        if args.features_scaling:
            test_data.normalize_features(features_scaler)
        if train_args.atom_descriptor_scaling and args.atom_descriptors is not None:
            test_data.normalize_features(atom_descriptor_scaler, scale_atom_descriptors=True)
        if train_args.bond_feature_scaling and args.bond_features_size > 0:
            test_data.normalize_features(bond_feature_scaler, scale_bond_features=True)

    # Make fingerprints
    model_preds = model_fingerprint(model=model, data_loader=test_data_loader)

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(model_preds)
    makedirs(args.preds_path, isfile=True)

    # Copy predictions over to full_data
    total_hidden_size = args.hidden_size * args.number_of_molecules
    fingerprint_columns = [f'fp_{i}' for i in range(total_hidden_size)]
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = model_preds[valid_index] if valid_index is not None else ['Invalid SMILES'] * total_hidden_size
        for i in range(len(fingerprint_columns)):
            datapoint.row[fingerprint_columns[i]] = preds[i]

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=args.smiles_columns + fingerprint_columns, extrasaction='ignore')
        writer.writeheader()
        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return model_preds
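# Usage sketch (assumed invocation): fingerprint generation above supports only a single
# checkpoint, so --checkpoint_path is used rather than a checkpoint directory. Paths are placeholders.
from chemprop.args import PredictArgs

fp_args = PredictArgs().parse_args([
    '--test_path', 'molecules.csv',
    '--preds_path', 'fingerprints.csv',
    '--checkpoint_path', 'ckpts/model.pt',
])
fingerprints = molecule_fingerprint(fp_args)  # one hidden_size * number_of_molecules vector per molecule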
def make_predictions(args: Namespace, smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles.
    Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False, args=args)
    else:
        test_data = get_data(path=args.test_path, args=args, use_compound_names=args.use_compound_names,
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [i for i in range(len(test_data)) if test_data[i].mol is not None]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    if args.use_compound_names:
        compound_names = test_data.compound_names()

    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros((len(test_data), args.num_tasks, args.multiclass_num_classes))
        sum_ale_uncs = np.zeros((len(test_data), args.num_tasks, args.multiclass_num_classes))
        sum_epi_uncs = np.zeros((len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))
        sum_ale_uncs = np.zeros((len(test_data), args.num_tasks))
        sum_epi_uncs = np.zeros((len(test_data), args.num_tasks))

    # Partial results for variance robust calculation.
    all_preds = np.zeros((len(test_data), args.num_tasks, len(args.checkpoint_paths)))

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for index, checkpoint_path in enumerate(tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths))):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds, ale_uncs, epi_uncs = predict(model=model, data=test_data, batch_size=args.batch_size,
                                                  scaler=scaler, sampling_size=args.sampling_size)
        sum_preds += np.array(model_preds)
        if ale_uncs is not None:
            sum_ale_uncs += np.array(ale_uncs)
        if epi_uncs is not None:
            sum_epi_uncs += np.array(epi_uncs)
        if args.estimate_variance:
            all_preds[:, :, index] = model_preds

        print('\nmodel_preds\n', model_preds)
        print('ale_uncs\n', ale_uncs)

    # Ensemble predictions
    if args.estimate_variance:
        # Use ensemble variance to estimate uncertainty. This overwrites existing uncertainty estimates.
        # preds <- mean(preds), ale_uncs <- mean(ale_uncs), epi_uncs <- var(preds)
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()
        avg_ale_uncs = sum_ale_uncs / len(args.checkpoint_paths)
        avg_ale_uncs = avg_ale_uncs.tolist()
        avg_epi_uncs = np.var(all_preds, axis=2)
        avg_epi_uncs = avg_epi_uncs.tolist()
    else:
        # Use another method to estimate uncertainty.
        # preds <- mean(preds), ale_uncs <- mean(ale_uncs), epi_uncs <- mean(epi_uncs)
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()
        avg_ale_uncs = sum_ale_uncs / len(args.checkpoint_paths)
        avg_ale_uncs = avg_ale_uncs.tolist()
        avg_epi_uncs = sum_epi_uncs / len(args.checkpoint_paths)
        avg_epi_uncs = avg_epi_uncs.tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    assert len(test_data) == len(avg_ale_uncs)
    assert len(test_data) == len(avg_epi_uncs)
    print(f'Saving predictions to {args.preds_path}')

    # Put Nones for invalid smiles
    full_preds = [None] * len(full_data)
    full_ale_uncs = [None] * len(full_data)
    full_epi_uncs = [None] * len(full_data)
    for i, si in enumerate(valid_indices):
        full_preds[si] = avg_preds[i]
        full_ale_uncs[si] = avg_ale_uncs[i]
        full_epi_uncs[si] = avg_epi_uncs[i]
    avg_preds = full_preds
    avg_ale_uncs = full_ale_uncs
    avg_epi_uncs = full_epi_uncs
    test_smiles = full_data.smiles()

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.writer(f)

        header = []
        if args.use_compound_names:
            header.append('compound_names')
        header.append('smiles')
        if args.dataset_type == 'multiclass':
            for name in args.task_names:
                for i in range(args.multiclass_num_classes):
                    header.append(name + '_class' + str(i))
        else:
            header.extend(args.task_names)
        header.extend([tn + "_ale_unc" for tn in args.task_names])
        header.extend([tn + "_epi_unc" for tn in args.task_names])
        writer.writerow(header)

        for i in range(len(avg_preds)):
            row = []
            if args.use_compound_names:
                row.append(compound_names[i])
            row.append(test_smiles[i])
            if avg_preds[i] is not None:
                if args.dataset_type == 'multiclass':
                    for task_probs in avg_preds[i]:
                        row.extend(task_probs)
                else:
                    row.extend(avg_preds[i])
                row.extend(avg_ale_uncs[i])
                row.extend(avg_epi_uncs[i])
            else:
                if args.dataset_type == 'multiclass':
                    row.extend([''] * args.num_tasks * args.multiclass_num_classes)
                else:
                    # Both the prediction, the aleatoric uncertainty and the epistemic uncertainty are None
                    row.extend([''] * 3 * args.num_tasks)
            writer.writerow(row)

    return avg_preds
def make_predictions(args: PredictArgs, smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles.
    Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    if ((train_args.features_path is not None or train_args.features_generator is not None)
            and args.features_path is None and args.features_generator is None):
        raise ValueError('Features were used during training so they must be specified again during prediction '
                         'using the same type of features as before (with either --features_generator or '
                         '--features_path and using --no_features_scaling if applicable).')

    # Update predict args with training arguments to create a merged args object
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)
    args: Union[PredictArgs, TrainArgs]

    # Split the data into parcels of parcel_size rows so very large files can be predicted in chunks
    if args.parcel_size and args.max_data_size:
        num_iterations = math.ceil(args.max_data_size / args.parcel_size)
        max_data_size = args.parcel_size
        print('Using parcels: ' + str(num_iterations))
    else:
        num_iterations = 1
        max_data_size = args.max_data_size
        print('Not using parcels.')

    if args.parcel_offset:
        offset = args.parcel_offset * args.parcel_size
    else:
        offset = 0

    for iteration in range(num_iterations):
        print('Loading data')
        if smiles is not None:
            full_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False,
                                             features_generator=args.features_generator)
        else:
            print("Getting without SMILES")
            full_data = get_data(path=args.test_path, args=args, target_columns=[],
                                 max_data_size=max_data_size, data_offset=offset, skip_invalid_smiles=False)

        print('Validating SMILES')
        full_to_valid_indices = {}
        valid_index = 0
        for full_index in range(len(full_data)):
            if full_data[full_index].mol is not None:
                full_to_valid_indices[full_index] = valid_index
                valid_index += 1
        test_data = MoleculeDataset([full_data[i] for i in sorted(full_to_valid_indices.keys())])

        # Edge case if empty list of smiles is provided
        if len(test_data) == 0:
            return [None] * len(full_data)

        print(f'Test size = {len(test_data):,}')

        # Normalize features
        if args.features_scaling:
            test_data.normalize_features(features_scaler)

        # Predict with each model individually and sum predictions
        if args.dataset_type == 'multiclass':
            sum_preds = np.zeros((len(test_data), num_tasks, args.multiclass_num_classes))
        else:
            sum_preds = np.zeros((len(test_data), num_tasks))

        # Create data loader
        test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=args.batch_size,
                                              num_workers=args.num_workers)

        print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
        for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
            # Load model
            model = load_checkpoint(checkpoint_path, device=args.device)
            model_preds = predict(model=model, data_loader=test_data_loader, scaler=scaler)
            sum_preds += np.array(model_preds)

        # Ensemble predictions
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()

        # Save predictions (each parcel after the first goes to its own numbered file)
        if iteration != 0:
            name, ext = os.path.splitext(args.preds_path)
            preds_path = "{name}.{it}.csv".format(name=name, it=iteration)
        else:
            preds_path = args.preds_path

        print(f'Saving predictions to {preds_path}')
        assert len(test_data) == len(avg_preds)
        makedirs(preds_path, isfile=True)

        # Get prediction column names
        if args.dataset_type == 'multiclass':
            task_names = [f'{name}_class_{i}' for name in task_names
                          for i in range(args.multiclass_num_classes)]

        # Copy predictions over to full_data
        for full_index, datapoint in enumerate(full_data):
            valid_index = full_to_valid_indices.get(full_index, None)
            preds = avg_preds[valid_index] if valid_index is not None else ['Invalid SMILES'] * len(task_names)
            for pred_name, pred in zip(task_names, preds):
                datapoint.row[pred_name] = pred

        with open(preds_path, 'w') as f:
            writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
            writer.writeheader()
            for datapoint in full_data:
                writer.writerow(datapoint.row)

        # Advance the offset for the next parcel
        if args.parcel_size:
            offset = offset + args.parcel_size

    return avg_preds
def make_predictions(args: PredictArgs, smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to make predictions on the data.

    If SMILES are provided, then makes predictions on smiles.
    Otherwise makes predictions on :code:`args.test_data`.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of lists of target predictions.
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    if ((train_args.features_path is not None or train_args.features_generator is not None)
            and args.features_path is None and args.features_generator is None):
        raise ValueError('Features were used during training so they must be specified again during prediction '
                         'using the same type of features as before (with either --features_generator or '
                         '--features_path and using --no_features_scaling if applicable).')

    # Update predict args with training arguments to create a merged args object
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)
    args: Union[PredictArgs, TrainArgs]

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False,
                                         features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path, target_columns=[], ignore_columns=[],
                             skip_invalid_smiles=False, args=args, store_row=True)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1
    test_data = MoleculeDataset([full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros((len(test_data), num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        # Load model and scalers
        model = load_checkpoint(checkpoint_path, device=args.device)
        scaler, features_scaler = load_scalers(checkpoint_path)

        # Normalize features
        if args.features_scaling:
            test_data.reset_features_and_targets()
            test_data.normalize_features(features_scaler)

        # Make predictions
        model_preds = predict(model=model, data_loader=test_data_loader, scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == 'multiclass':
        task_names = [f'{name}_class_{i}' for name in task_names
                      for i in range(args.multiclass_num_classes)]

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = avg_preds[valid_index] if valid_index is not None else ['Invalid SMILES'] * len(task_names)
        for pred_name, pred in zip(task_names, preds):
            datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()
        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
def molecule_fingerprint(args: FingerprintArgs, smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to encode fingerprint vectors for the data.

    :param args: A :class:`~chemprop.args.FingerprintArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of fingerprint vectors (list of floats)
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    if args.fingerprint_type == 'MPN':
        # Only need to supply input features if using the FFN latent representation and if the model calls for them.
        validate_feature_sources = False
    else:
        validate_feature_sources = True
    update_prediction_args(predict_args=args, train_args=train_args,
                           validate_feature_sources=validate_feature_sources)
    args: Union[FingerprintArgs, TrainArgs]

    # Set explicit H option and reaction option
    reset_featurization_parameters()
    if args.atom_descriptors == 'feature':
        set_extra_atom_fdim(train_args.atom_features_size)
    if args.bond_features_path is not None:
        set_extra_bond_fdim(train_args.bond_features_size)
    set_explicit_h(train_args.explicit_h)
    set_adding_hs(args.adding_h)
    if train_args.reaction:
        set_reaction(train_args.reaction, train_args.reaction_mode)
    elif train_args.reaction_solvent:
        set_reaction(True, train_args.reaction_mode)

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False,
                                         features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path, smiles_columns=args.smiles_columns, target_columns=[],
                             ignore_columns=[], skip_invalid_smiles=False, args=args, store_row=True)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1
    test_data = MoleculeDataset([full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data, batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    # Set fingerprint size
    if args.fingerprint_type == 'MPN':
        if args.atom_descriptors == "descriptor":
            # Special case when we have 'descriptor': extra dimensions need to be added
            total_fp_size = (args.hidden_size + test_data.atom_descriptors_size()) * args.number_of_molecules
        else:
            if args.reaction_solvent:
                total_fp_size = args.hidden_size + args.hidden_size_solvent
            else:
                total_fp_size = args.hidden_size * args.number_of_molecules
        if args.features_only:
            raise ValueError('With features_only models, there is no latent MPN representation. '
                             'Use last_FFN fingerprint type instead.')
    elif args.fingerprint_type == 'last_FFN':
        if args.ffn_num_layers != 1:
            total_fp_size = args.ffn_hidden_size
        else:
            raise ValueError('With a ffn_num_layers of 1, there is no latent FFN representation. '
                             'Use MPN fingerprint type instead.')
    else:
        raise ValueError(f'Fingerprint type {args.fingerprint_type} not supported')
    all_fingerprints = np.zeros((len(test_data), total_fp_size, len(args.checkpoint_paths)))

    # Load model
    print(f'Encoding smiles into a fingerprint vector from {len(args.checkpoint_paths)} models.')
    for index, checkpoint_path in enumerate(tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths))):
        model = load_checkpoint(checkpoint_path, device=args.device)
        scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler = load_scalers(args.checkpoint_paths[index])

        # Normalize features
        if args.features_scaling or train_args.atom_descriptor_scaling or train_args.bond_feature_scaling:
            test_data.reset_features_and_targets()
            if args.features_scaling:
                test_data.normalize_features(features_scaler)
            if train_args.atom_descriptor_scaling and args.atom_descriptors is not None:
                test_data.normalize_features(atom_descriptor_scaler, scale_atom_descriptors=True)
            if train_args.bond_feature_scaling and args.bond_features_size > 0:
                test_data.normalize_features(bond_feature_scaler, scale_bond_features=True)

        # Make fingerprints
        model_fp = model_fingerprint(model=model, data_loader=test_data_loader,
                                     fingerprint_type=args.fingerprint_type)
        if args.fingerprint_type == 'MPN' and (args.features_path is not None or args.features_generator):
            # Truncate any features from MPN fingerprint
            model_fp = np.array(model_fp)[:, :total_fp_size]
        all_fingerprints[:, :, index] = model_fp

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    # assert len(test_data) == len(all_fingerprints)  # TODO: add unit test for this
    makedirs(args.preds_path, isfile=True)

    # Set column names
    fingerprint_columns = []
    if args.fingerprint_type == 'MPN':
        if len(args.checkpoint_paths) == 1:
            for j in range(total_fp_size // args.number_of_molecules):
                for k in range(args.number_of_molecules):
                    fingerprint_columns.append(f'fp_{j}_mol_{k}')
        else:
            for j in range(total_fp_size // args.number_of_molecules):
                for i in range(len(args.checkpoint_paths)):
                    for k in range(args.number_of_molecules):
                        fingerprint_columns.append(f'fp_{j}_mol_{k}_model_{i}')
    else:  # args.fingerprint_type == 'last_FFN'
        if len(args.checkpoint_paths) == 1:
            for j in range(total_fp_size):
                fingerprint_columns.append(f'fp_{j}')
        else:
            for j in range(total_fp_size):
                for i in range(len(args.checkpoint_paths)):
                    fingerprint_columns.append(f'fp_{j}_model_{i}')

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = all_fingerprints[valid_index].reshape((len(args.checkpoint_paths) * total_fp_size)) \
            if valid_index is not None else ['Invalid SMILES'] * len(args.checkpoint_paths) * total_fp_size
        for i in range(len(fingerprint_columns)):
            datapoint.row[fingerprint_columns[i]] = preds[i]

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=args.smiles_columns + fingerprint_columns, extrasaction='ignore')
        writer.writeheader()
        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return all_fingerprints
def make_predictions(args: Namespace, smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles.
    Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, drug_scaler, cmpd_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)
    else:
        test_data = get_data(path=args.test_path, args=args, use_compound_names=args.use_compound_names,
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [i for i in range(len(test_data)) if test_data[i].drug_mol is not None]
    full_data = test_data
    test_data = MolPairDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    if args.use_compound_names:
        compound_names = test_data.compound_names()

    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(drug_scaler, cmpd_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros((len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds = predict(
            model=model,
            data=test_data,
            batch_size=args.batch_size,
            scaler=scaler  # TODO: Shouldn't this be the custom scalers if avail?
        )
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    print(f'Saving predictions to {args.preds_path}')

    # Put Nones for invalid smiles
    full_preds = [None] * len(full_data)
    for i, si in enumerate(valid_indices):
        full_preds[si] = avg_preds[i]
    avg_preds = full_preds
    test_smiles = full_data.smiles()

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.writer(f)

        header = ['drugSMILE', 'cmpdSMILE']
        if args.dataset_type == 'multiclass':
            for name in args.task_names:
                for i in range(args.multiclass_num_classes):
                    header.append(name + '_class' + str(i))
        else:
            header.extend(args.task_names)
        writer.writerow(header)

        for i in range(len(avg_preds)):
            row = [test_smiles[i][0], test_smiles[i][1]]
            if avg_preds[i] is not None:
                if args.dataset_type == 'multiclass':
                    for task_probs in avg_preds[i]:
                        row.extend(task_probs)
                else:
                    row.extend(avg_preds[i])
            else:
                if args.dataset_type == 'multiclass':
                    row.extend([''] * args.num_tasks * args.multiclass_num_classes)
                else:
                    row.extend([''] * args.num_tasks)
            writer.writerow(row)

    return avg_preds
def make_predictions(args: Namespace, smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles.
    Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])
    data = smiles

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    # if smiles is not None:
    #     test_data = get_data_from_smiles_fast(smiles=smiles, skip_invalid_smiles=False)
    # else:
    #     test_data = get_data(path=args.test_path, args=args, use_compound_names=args.use_compound_names, skip_invalid_smiles=False)
    with open(args.test_path, 'r') as f:
        smiles = list(map(lambda x: x.split(',')[0].strip(), f.readlines()[1:]))
    assert (smiles is not None)

    print('Validating SMILES')
    # valid_indices = [i for i in range(len(test_data)) if test_data[i].mol is not None]
    # full_data = test_data
    # test_data = MoleculeDataset([test_data[i] for i in valid_indices])
    #
    # # Edge case if empty list of smiles is provided
    # if len(test_data) == 0:
    #     return [None] * len(full_data)
    #
    # if args.use_compound_names:
    #     compound_names = test_data.compound_names()
    # print(f'Test size = {len(test_data):,}')
    #
    # # Normalize features
    # if train_args.features_scaling:
    #     test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    # if args.dataset_type == 'multiclass':
    #     sum_preds = np.zeros((len(smiles), args.num_tasks, args.multiclass_num_classes))
    # else:
    #     sum_preds = np.zeros((len(smiles), args.num_tasks))
    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        avg_preds = predict(model=model, data=smiles, batch_size=args.batch_size, scaler=scaler, args=args)
        # avg_preds += np.array(model_preds)

    # Ensemble predictions
    # avg_preds = sum_preds / len(args.checkpoint_paths)
    # avg_preds = avg_preds.tolist()

    # Save predictions
    print(len(smiles), len(avg_preds))
    assert len(smiles) == len(avg_preds)
    print(f'Saving predictions to {args.preds_path}')

    # Put Nones for invalid smiles
    full_preds = avg_preds
    # for i, si in enumerate(valid_indices):
    #     full_preds[si] = avg_preds[i]
    avg_preds = full_preds
    test_smiles = smiles

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.writer(f)

        header = []
        if args.use_compound_names:
            header.append('compound_names')
        header.append('smiles')
        if args.dataset_type == 'multiclass':
            for name in args.task_names:
                for i in range(args.multiclass_num_classes):
                    header.append(name + '_class' + str(i))
        else:
            header.extend(args.task_names)
        writer.writerow(header)

        for i in range(len(avg_preds)):
            row = []
            # if args.use_compound_names:
            #     row.append(compound_names[i])
            row.append(test_smiles[i])
            if avg_preds[i] is not None:
                if args.dataset_type == 'multiclass':
                    for task_probs in avg_preds[i]:
                        row.extend(task_probs)
                else:
                    row.extend(avg_preds[i])
            else:
                if args.dataset_type == 'multiclass':
                    row.extend([''] * args.num_tasks * args.multiclass_num_classes)
                else:
                    row.extend([''] * args.num_tasks)
            writer.writerow(row)

    return avg_preds