def __call__(self, smiles: List[str], batch_size: int = 500) -> List[List[float]]:
    test_data = get_data_from_smiles(
        smiles=smiles,
        skip_invalid_smiles=False,
        features_generator=self.args.features_generator)
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    if self.train_args.features_scaling:
        test_data.normalize_features(self.features_scaler)

    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=batch_size)

    sum_preds = []
    for model in self.checkpoints:
        model_preds = predict(model=model,
                              data_loader=test_data_loader,
                              scaler=self.scaler,
                              disable_progress_bar=True)
        sum_preds.append(np.array(model_preds))

    # Ensemble predictions
    sum_preds = sum(sum_preds)
    avg_preds = sum_preds / len(self.checkpoints)

    # Convert to a list of lists to match the annotated return type
    return avg_preds.tolist()
def __call__(self, smiles, batch_size=500):
    test_data = get_data_from_smiles(smiles=smiles,
                                     skip_invalid_smiles=False,
                                     args=self.train_args)
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    if self.train_args.features_scaling:
        test_data.normalize_features(self.features_scaler)

    sum_preds = []
    for model in self.checkpoints:
        model_preds = predict(model=model,
                              data=test_data,
                              batch_size=batch_size,
                              scaler=self.scaler,
                              disable_progress_bar=True)
        sum_preds.append(np.array(model_preds))

    # Ensemble predictions
    sum_preds = sum(sum_preds)
    avg_preds = sum_preds / len(self.checkpoints)

    return avg_preds
def __call__(self, smiles, batch_size=500):
    test_data = get_data_from_smiles(smiles=smiles,
                                     skip_invalid_smiles=False,
                                     args=self.train_args)
    valid_indices = [i for i in range(len(test_data)) if test_data[i].mol is not None]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    if self.train_args.features_scaling:
        test_data.normalize_features(self.features_scaler)

    sum_preds = np.zeros((len(test_data), 1))
    for model in self.checkpoints:
        model_preds = predict(
            model=model,
            data=test_data,
            batch_size=batch_size,
            scaler=self.scaler
        )
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(self.checkpoints)
    avg_preds = avg_preds.squeeze(-1).tolist()

    # Put zero for invalid smiles
    full_preds = [0.0] * len(full_data)
    for i, si in enumerate(valid_indices):
        full_preds[si] = avg_preds[i]

    return np.array(full_preds, dtype=np.float32)
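# A minimal, self-contained sketch (not from the original code) of the
# invalid-SMILES bookkeeping used by the wrappers above: molecules that RDKit
# cannot parse are dropped before prediction, then re-inserted as 0.0 so the
# output lines up with the input list. The fake `model_scores` list stands in
# for the ensemble-averaged chemprop predictions.
import numpy as np
from rdkit import Chem

smiles = ['CCO', 'not_a_smiles', 'c1ccccc1']
mols = [Chem.MolFromSmiles(s) for s in smiles]
valid_indices = [i for i, mol in enumerate(mols) if mol is not None]

# Pretend these came from predict() on the valid molecules only.
model_scores = [0.12, 0.87]

full_preds = [0.0] * len(smiles)
for i, si in enumerate(valid_indices):
    full_preds[si] = model_scores[i]

print(np.array(full_preds, dtype=np.float32))  # [0.12 0.   0.87]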
def gcnn_predict(self) -> Tuple[array, array]:
    """
    Handles graph convolutional neural network predictions, enters them into
    the predictions DataFrame and reports any errors.

    Reads:
        self.rdkit_mols (array): a numpy array containing RDKit molecules

    Returns:
        predictions, labels (Tuple[array, array]): predictions and labels
    """
    smiles = self.rdkit_mols.tolist()
    full_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if full_data[full_index].mol is not None:
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=50,
                                          num_workers=0)

    model_preds = predict(model=rlm_gcnn_model,
                          data_loader=test_data_loader,
                          scaler=rlm_gcnn_scaler)

    # Masked arrays keep positions of invalid molecules empty
    predictions = np.ma.empty(len(full_data))
    predictions.mask = True
    labels = np.ma.empty(len(full_data))
    labels.mask = True
    for key in full_to_valid_indices.keys():
        full_index = int(key)
        predictions[full_index] = model_preds[full_to_valid_indices[key]][0]
        labels[full_index] = np.round(model_preds[full_to_valid_indices[key]][0], 0)

    self.predictions_df['GCNN'] = pd.Series(
        pd.Series(labels).fillna('').astype(str) + ' (' +
        pd.Series(predictions).round(2).astype(str) + ')').str.replace(
            '(nan)', '', regex=False)

    if len(self.predictions_df.index) > len(predictions) or np.ma.count_masked(predictions) > 0:
        self.model_errors.append('graph convolutional neural network')
        self.has_errors = True

    return predictions, labels
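# A self-contained sketch of the masked-array bookkeeping used above: entries
# for invalid molecules stay masked, become NaN when the masked array is
# converted to a pandas Series, and are then blanked out of the formatted
# "label (probability)" column. The values below are made up.
import numpy as np
import pandas as pd

predictions = np.ma.empty(3)
predictions.mask = True
labels = np.ma.empty(3)
labels.mask = True

# Only positions 0 and 2 were valid molecules.
for idx, p in [(0, 0.91), (2, 0.12)]:
    predictions[idx] = p
    labels[idx] = np.round(p, 0)

column = pd.Series(
    pd.Series(labels).fillna('').astype(str) + ' (' +
    pd.Series(predictions).round(2).astype(str) + ')'
).str.replace('(nan)', '', regex=False)
print(column.tolist())  # ['1.0 (0.91)', ' ', '0.0 (0.12)']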
def __call__(self, smiles: List[str], batch_size: int = 500) -> List[List[float]]:
    """
    Makes predictions on a list of SMILES.

    :param smiles: A list of SMILES to make predictions on.
    :param batch_size: The batch size.
    :return: A list of lists of floats containing the predicted values.
    """
    test_data = get_data_from_smiles(
        smiles=smiles,
        skip_invalid_smiles=False,
        features_generator=self.args.features_generator)
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    if self.train_args.features_scaling:
        test_data.normalize_features(self.features_scaler)
    if self.train_args.atom_descriptor_scaling and self.args.atom_descriptors is not None:
        test_data.normalize_features(self.atom_descriptor_scaler,
                                     scale_atom_descriptors=True)
    if self.train_args.bond_feature_scaling and self.args.bond_features_size > 0:
        test_data.normalize_features(self.bond_feature_scaler,
                                     scale_bond_features=True)

    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=batch_size)

    sum_preds = []
    for model in self.checkpoints:
        model_preds = predict(model=model,
                              data_loader=test_data_loader,
                              scaler=self.scaler,
                              disable_progress_bar=True)
        sum_preds.append(np.array(model_preds))

    # Ensemble predictions
    sum_preds = sum(sum_preds)
    avg_preds = sum_preds / len(self.checkpoints)

    # Convert to a list of lists to match the documented return type
    return avg_preds.tolist()
def run_training(args, save_dir):
    tgt_data, val_data, test_data, src_data = prepare_data(args)
    inv_model = prepare_model(args)
    print('invariant', inv_model)

    optimizer = build_optimizer(inv_model, args)
    scheduler = build_lr_scheduler(optimizer, args)
    inv_opt = (optimizer, scheduler)

    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric)

    best_score = float('inf') if args.minimize_score else -float('inf')
    best_epoch = 0
    for epoch in range(args.epochs):
        print(f'Epoch {epoch}')
        train(inv_model, src_data, tgt_data, loss_func, inv_opt, args)

        val_scores = evaluate(inv_model, val_data, args.num_tasks, metric_func,
                              args.batch_size, args.dataset_type)
        avg_val_score = np.nanmean(val_scores)
        print(f'Validation {args.metric} = {avg_val_score:.4f}')

        if (args.minimize_score and avg_val_score < best_score) or \
                (not args.minimize_score and avg_val_score > best_score):
            best_score, best_epoch = avg_val_score, epoch
            save_checkpoint(os.path.join(save_dir, 'model.pt'), inv_model, args=args)

    print(f'Loading model checkpoint from epoch {best_epoch}')
    model = load_checkpoint(os.path.join(save_dir, 'model.pt'), cuda=args.cuda)

    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    test_preds = predict(model, test_data, args.batch_size)
    test_scores = evaluate_predictions(test_preds, test_targets, args.num_tasks,
                                       metric_func, args.dataset_type)
    avg_test_score = np.nanmean(test_scores)
    print(f'Test {args.metric} = {avg_test_score:.4f}')

    return avg_test_score
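# A stand-alone sketch of the best-epoch checkpointing pattern in run_training
# above, written with plain PyTorch instead of chemprop's save_checkpoint /
# load_checkpoint helpers. The toy model and fake validation scores are
# placeholders, not the real train(...)/evaluate(...) calls.
import os
import torch
import torch.nn as nn

save_dir = 'demo_ckpt'
os.makedirs(save_dir, exist_ok=True)

model = nn.Linear(8, 1)
minimize_score = True
best_score = float('inf') if minimize_score else -float('inf')
best_epoch = 0

for epoch in range(5):
    # Stand-in for train(...) and evaluate(...): a fake, improving score.
    avg_val_score = 1.0 / (epoch + 1)
    if (minimize_score and avg_val_score < best_score) or \
            (not minimize_score and avg_val_score > best_score):
        best_score, best_epoch = avg_val_score, epoch
        torch.save(model.state_dict(), os.path.join(save_dir, 'model.pt'))

print(f'Loading model checkpoint from epoch {best_epoch}')
model.load_state_dict(torch.load(os.path.join(save_dir, 'model.pt')))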
def __call__(self, smiles, batch_size=500):
    test_data = get_data_from_smiles(
        smiles=[[s] for s in smiles],
        skip_invalid_smiles=False,
        features_generator=self.features_generator)
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol[0] is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=batch_size)

    sum_preds = np.zeros((len(test_data), 1))
    for model, scaler, features_scaler in zip(self.checkpoints, self.scalers,
                                               self.features_scalers):
        # Each checkpoint gets a fresh copy of the features scaled with its own scaler
        test_data.reset_features_and_targets()
        if features_scaler is not None:
            test_data.normalize_features(features_scaler)

        model_preds = predict(model=model,
                              data_loader=test_data_loader,
                              scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(self.checkpoints)
    avg_preds = avg_preds.squeeze(-1).tolist()

    # Put zero for invalid smiles
    full_preds = [0.0] * len(full_data)
    for i, si in enumerate(valid_indices):
        full_preds[si] = avg_preds[i]

    return np.array(full_preds, dtype=np.float32)
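# A NumPy-only sketch of the per-checkpoint scaling + ensemble averaging done
# above: the features are re-normalized with each checkpoint's own scaler, the
# predictions are accumulated, and the sum is divided by the ensemble size.
# The placeholder "models" are plain functions, not chemprop checkpoints.
import numpy as np

features = np.array([[1.0, 2.0], [3.0, 4.0]])      # 2 molecules, 2 features
scalers = [(0.0, 1.0), (1.0, 2.0)]                 # (mean, std) per checkpoint
models = [
    lambda x: x.sum(axis=1, keepdims=True),        # fake checkpoint 1
    lambda x: x.mean(axis=1, keepdims=True),       # fake checkpoint 2
]

sum_preds = np.zeros((len(features), 1))
for model, (mean, std) in zip(models, scalers):
    scaled = (features - mean) / std               # fresh normalization per checkpoint
    sum_preds += model(scaled)

avg_preds = sum_preds / len(models)
print(avg_preds.squeeze(-1).tolist())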
def gcnn_predict(self, model, scaler) -> Tuple[array, array]:
    """
    Handles graph convolutional neural network predictions, enters them into
    the predictions DataFrame and reports any errors.

    Parameters:
        model: the trained GCNN model
        scaler: the scaler applied to the model outputs

    Returns:
        predictions, labels (Tuple[array, array]): predictions and labels
    """
    smiles = self.kekule_smiles.tolist()
    full_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if full_data[full_index].mol is not None:
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    data = MoleculeDataset([full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # create data loader
    data_loader = MoleculeDataLoader(
        dataset=data,
        batch_size=50,
        num_workers=0
    )

    model_preds = predict(
        model=model,
        data_loader=data_loader,
        scaler=scaler
    )

    # Masked arrays keep positions of invalid molecules empty
    predictions = np.ma.empty(len(full_data))
    predictions.mask = True
    labels = np.ma.empty(len(full_data), dtype=np.int32)
    labels.mask = True
    for key in full_to_valid_indices.keys():
        full_index = int(key)
        predictions[full_index] = model_preds[full_to_valid_indices[key]][0]
        labels[full_index] = np.round(model_preds[full_to_valid_indices[key]][0], 0)

    if self.smiles is not None:
        dt = datetime.datetime.now(timezone.utc)
        utc_time = dt.replace(tzinfo=timezone.utc)
        utc_timestamp = utc_time.timestamp()
        # NOTE: DataFrame.append assumes pandas < 2.0; it was removed in pandas 2.0
        self.raw_predictions_df = self.raw_predictions_df.append(
            pd.DataFrame(
                {
                    'SMILES': self.smiles,
                    'model': self.model_name,
                    'prediction': predictions,
                    'timestamp': utc_timestamp
                }
            ),
            ignore_index=True
        )

    # if self.interpret == True:
    #     intrprt_df = get_interpretation(self.smiles, self.model_name)
    # else:
    #     col_names = ['smiles', 'rationale_smiles', 'rationale_score']
    #     intrprt_df = pd.DataFrame(columns=col_names)
    # self.predictions_df['smiles'] = pd.Series(np.where(intrprt_df['rationale_scores'] > 0, intrprt_df['smiles'] + '_' + intrprt_df['rationale_smiles'], intrprt_df['smiles']))

    self.predictions_df[self.column_dict_key] = pd.Series(
        pd.Series(labels).fillna('').astype(str) + ' (' +
        pd.Series(np.where(predictions >= 0.5, predictions, (1 - predictions))).round(2).astype(str) +
        ')').str.replace('(nan)', '', regex=False)

    if len(self.predictions_df.index) > len(predictions) or np.ma.count_masked(predictions) > 0:
        self.model_errors.append('graph convolutional neural network')
        self.has_errors = True

    return predictions, labels
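# DataFrame.append (used above) was deprecated in pandas 1.4 and removed in
# pandas 2.0. On newer pandas the same raw-predictions logging can be written
# with pd.concat; the column names mirror the snippet above, the values are
# placeholders.
import datetime
from datetime import timezone

import numpy as np
import pandas as pd

utc_timestamp = datetime.datetime.now(timezone.utc).timestamp()

raw_predictions_df = pd.DataFrame({
    'SMILES': ['CCN'],
    'model': 'example_model',
    'prediction': [0.55],
    'timestamp': utc_timestamp,
})

new_rows = pd.DataFrame({
    'SMILES': ['CCO', 'c1ccccc1'],
    'model': 'example_model',
    'prediction': np.array([0.91, 0.12]),
    'timestamp': utc_timestamp,
})
raw_predictions_df = pd.concat([raw_predictions_df, new_rows], ignore_index=True)
print(raw_predictions_df)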