def _get_data_from_df(data_df: pd.DataFrame,
                      skip_invalid_smiles: bool = True,
                      features_path: List[str] = None,
                      features_generator: List[str] = None,
                      max_data_size: int = None) -> MoleculeDataset:
    """
    Builds a MoleculeDataset from the rows of a DataFrame.

    Each pair produced by ``data_df.iterrows()`` is unpacked as ``(smiles, targets)``: the row's
    index label is used as the SMILES string and the row itself is passed on as the targets.
    NOTE(review): this presumably expects ``data_df`` to be indexed by SMILES -- confirm with the caller.

    :param data_df: DataFrame from parsed CSV file.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles. When False, every row with
                                a non-empty SMILES is kept even if RDKit cannot parse it.
    :param features_path: A list of paths to files containing features. The loaded arrays are
                          concatenated along axis 1 and matched to rows by position.
    :param features_generator: List of strings of features_generator names.
    :param max_data_size: The maximum number of data points to load.
    :return: A MoleculeDataset containing smiles strings and target values along with other info such
             as additional features when desired.
    """
    max_data_size = max_data_size or float('inf')

    # Load features
    if features_path:
        # Each is num_data x num_features
        features_data = np.concatenate(
            [load_features(feat_path) for feat_path in features_path], axis=1)
    else:
        features_data = None

    # Load data
    num_rows = min(max_data_size, len(data_df))
    data = []
    for idx, (smiles, targets) in enumerate(data_df.head(num_rows).iterrows()):
        if not smiles:
            continue

        mol = Chem.MolFromSmiles(smiles)
        if not skip_invalid_smiles or (mol and mol.GetNumHeavyAtoms()):
            data.append(MoleculeDatapoint(
                smiles=smiles,
                mol=mol,
                targets=targets,
                # Compare against None explicitly: the truth value of a multi-element
                # NumPy array is ambiguous and raises ValueError.
                features=features_data[idx] if features_data is not None else None,
                features_generator=features_generator,
                compound_name=None
            ))

    return MoleculeDataset(data)
def test_predict_spectra(self,
                         name: str,
                         model_type: str,
                         expected_score: float,
                         expected_nans: int,
                         train_flags: List[str] = None,
                         predict_flags: List[str] = None):
    """
    Trains and predicts on the spectra dataset, then checks the SID score and NaN count.

    :param name: Name of the test case (not used in the body).
    :param model_type: Model type forwarded to :code:`self.train` and :code:`self.predict`.
    :param expected_score: Expected SID value; compared with a relative tolerance of DELTA.
    :param expected_nans: Expected number of NaN entries in the predictions.
    :param train_flags: Extra flags for training. Position 5 is read as the phase mask path
                        when '--spectra_phase_mask_path' is among the flags.
    :param predict_flags: Extra flags for prediction. Position 1 is read as the phase features path.
    """
    dataset_type = 'spectra'

    with TemporaryDirectory() as save_dir:
        # Train
        self.train(
            dataset_type=dataset_type,
            metric='sid',
            save_dir=save_dir,
            model_type=model_type,
            flags=train_flags
        )

        # Predict
        preds_path = os.path.join(save_dir, 'preds.csv')
        self.predict(
            dataset_type=dataset_type,
            preds_path=preds_path,
            save_dir=save_dir,
            model_type=model_type,
            flags=predict_flags
        )

        # Check results: same columns and same molecules, in the same order
        pred_df = pd.read_csv(preds_path)
        true_df = pd.read_csv(os.path.join(TEST_DATA_DIR, 'spectra.csv'))
        self.assertEqual(list(pred_df.keys()), list(true_df.keys()))
        self.assertEqual(list(pred_df['smiles']), list(true_df['smiles']))

        pred = pred_df.drop(columns=['smiles']).to_numpy()
        true = true_df.drop(columns=['smiles']).to_numpy()

        # The reference spectra are normalized before they are compared to the predictions
        phase_features = load_features(predict_flags[1])
        if '--spectra_phase_mask_path' in train_flags:
            mask = load_phase_mask(train_flags[5])
        else:
            mask = None
        true = normalize_spectra(true, phase_features, mask)

        scores = evaluate_predictions(
            preds=pred,
            targets=true,
            num_tasks=len(true[0]),
            metrics=['sid'],
            dataset_type='spectra'
        )
        sid = scores['sid'][0]

        self.assertAlmostEqual(sid, expected_score, delta=DELTA * expected_score)
        self.assertEqual(np.sum(np.isnan(pred)), expected_nans)
def load_temp(temp_dir: str) -> Tuple[List[List[float]], int]:
    """
    Loads all features saved as .npz files in temp_dir.

    Assumes temporary files are named in order 0.npz, 1.npz, ...
    Loading stops at the first index whose file does not exist.

    :param temp_dir: Directory in which temporary .npz files containing features are stored.
    :return: A tuple with the features collected from all files that were found and the number
             of temporary files that were loaded.
    """
    features = []
    temp_num = 0

    while True:
        temp_path = os.path.join(temp_dir, f'{temp_num}.npz')
        if not os.path.exists(temp_path):
            # First gap in the numbering ends the scan
            return features, temp_num

        features.extend(load_features(temp_path))
        temp_num += 1
def get_data(path: str,
             smiles_column: str = None,
             target_columns: List[str] = None,
             ignore_columns: List[str] = None,
             skip_invalid_smiles: bool = True,
             args: Union[TrainArgs, PredictArgs] = None,
             features_path: List[str] = None,
             features_generator: List[str] = None,
             atom_descriptors_path: str = None,
             max_data_size: int = None,
             store_row: bool = False,
             logger: Logger = None,
             skip_none_targets: bool = False) -> MoleculeDataset:
    """
    Gets SMILES and target values from a CSV file.

    :param path: Path to a CSV file.
    :param smiles_column: The name of the column containing SMILES.
                          By default, uses the first column.
    :param target_columns: Name of the columns containing target values. By default, uses all columns
                           except the :code:`smiles_column` and the :code:`ignore_columns`.
    :param ignore_columns: Name of the columns to ignore when :code:`target_columns` is not provided.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles using :func:`filter_invalid_smiles`.
    :param args: Arguments, either :class:`~chemprop.args.TrainArgs` or :class:`~chemprop.args.PredictArgs`.
                 Atom features/descriptors are only loaded when :code:`args` is provided, because
                 :code:`args.atom_descriptors` selects how they are used.
    :param features_path: A list of paths to files containing features. If provided, it is used
                          in place of :code:`args.features_path`.
    :param features_generator: A list of features generators to use. If provided, it is used
                               in place of :code:`args.features_generator`.
    :param atom_descriptors_path: The path to the file containing the custom atom descriptors.
    :param max_data_size: The maximum number of data points to load.
    :param store_row: Whether to store the raw CSV row in each :class:`~chemprop.data.data.MoleculeDatapoint`.
    :param logger: A logger for recording output. Falls back to :code:`print` when not given.
    :param skip_none_targets: Whether to skip targets that are all 'None'. This is mostly relevant when
                              --target_columns are passed in, so only a subset of tasks are examined.
    :return: A :class:`~chemprop.data.MoleculeDataset` containing SMILES and target values along
             with other info such as additional features when desired.
    """
    debug = logger.debug if logger is not None else print

    atom_features = None
    atom_descriptors = None

    if args is not None:
        # Prefer explicit function arguments but default to args if not provided
        smiles_column = smiles_column if smiles_column is not None else args.smiles_column
        target_columns = target_columns if target_columns is not None else args.target_columns
        ignore_columns = ignore_columns if ignore_columns is not None else args.ignore_columns
        features_path = features_path if features_path is not None else args.features_path
        features_generator = features_generator if features_generator is not None else args.features_generator
        atom_descriptors_path = atom_descriptors_path if atom_descriptors_path is not None \
            else args.atom_descriptors_path
        max_data_size = max_data_size if max_data_size is not None else args.max_data_size

        # Load atomic descriptors
        if args.atom_descriptors == 'feature':
            atom_features = load_atom_features(atom_descriptors_path)
        elif args.atom_descriptors == 'descriptor':
            atom_descriptors = load_atom_features(atom_descriptors_path)

    max_data_size = max_data_size or float('inf')

    # Load features
    if features_path is not None:
        # Each is num_data x num_features
        features_data = np.concatenate(
            [load_features(feat_path) for feat_path in features_path], axis=1)
    else:
        features_data = None

    # Load data
    all_smiles, all_targets, all_rows, all_features = [], [], [], []
    all_atom_features, all_atom_descriptors = [], []

    with open(path) as f:
        reader = csv.DictReader(f)
        columns = reader.fieldnames

        # By default, the SMILES column is the first column
        if smiles_column is None:
            smiles_column = columns[0]

        # By default, the targets columns are all the columns except the SMILES column
        if target_columns is None:
            excluded_columns = {smiles_column, *(ignore_columns or [])}
            target_columns = [column for column in columns if column not in excluded_columns]

        for i, row in tqdm(enumerate(reader)):
            smiles = row[smiles_column]
            targets = [float(row[column]) if row[column] != '' else None for column in target_columns]

            # Check whether all targets are None and skip if so
            if skip_none_targets and all(x is None for x in targets):
                continue

            all_smiles.append(smiles)
            all_targets.append(targets)

            # Per-row side data is picked by the raw CSV row index i, so that it stays
            # aligned with its molecule even when earlier rows were skipped above.
            if features_data is not None:
                all_features.append(features_data[i])
            if atom_features is not None:
                all_atom_features.append(atom_features[i])
            if atom_descriptors is not None:
                all_atom_descriptors.append(atom_descriptors[i])

            if store_row:
                all_rows.append(row)

            if len(all_smiles) >= max_data_size:
                break

    data = MoleculeDataset([
        MoleculeDatapoint(
            smiles=smiles,
            targets=targets,
            row=all_rows[i] if store_row else None,
            features_generator=features_generator,
            features=all_features[i] if features_data is not None else None,
            atom_features=all_atom_features[i] if atom_features is not None else None,
            atom_descriptors=all_atom_descriptors[i] if atom_descriptors is not None else None,
        ) for i, (smiles, targets) in tqdm(enumerate(zip(all_smiles, all_targets)),
                                           total=len(all_smiles))
    ])

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        data = filter_invalid_smiles(data)

        if len(data) < original_data_len:
            debug(f'Warning: {original_data_len - len(data)} SMILES are invalid.')

    return data
def get_data(path: str,
             skip_invalid_smiles: bool = True,
             args: Namespace = None,
             features_path: List[str] = None,
             max_data_size: int = None,
             use_compound_names: bool = None,
             logger: Logger = None) -> MoleculeDataset:
    """
    Gets smiles string and target values (and optionally compound names if provided) from a CSV file.

    :param path: Path to a CSV file. The first line is treated as a header and skipped.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles.
    :param args: Arguments. When given, :code:`args.features_dim` is set to the feature length of
                 the first datapoint if the dataset is non-empty and has features.
    :param features_path: A list of paths to files containing features. If provided, it is used
                          in place of args.features_path.
    :param max_data_size: The maximum number of data points to load.
    :param use_compound_names: Whether file has compound names in addition to smiles strings.
                               Defaults to args.use_compound_names, or False when args is not given.
    :param logger: Logger. Falls back to :code:`print` when not given.
    :return: A MoleculeDataset containing smiles strings and target values along
             with other info such as additional features and compound names when desired.
    """
    debug = logger.debug if logger is not None else print

    if args is not None:
        # Prefer explicit function arguments but default to args if not provided
        features_path = features_path if features_path is not None else args.features_path
        max_data_size = max_data_size if max_data_size is not None else args.max_data_size
        use_compound_names = use_compound_names if use_compound_names is not None else args.use_compound_names
    elif use_compound_names is None:
        # Only fall back to False when the caller did not pass a value explicitly
        use_compound_names = False

    max_data_size = max_data_size or float('inf')

    # Load features
    if features_path is not None:
        # Each is num_data x num_features
        features_data = np.concatenate(
            [load_features(feat_path) for feat_path in features_path], axis=1)
    else:
        features_data = None

    # No SMILES are excluded in this version; the set is kept as an (always empty) hook
    skip_smiles = set()

    # Load data
    with open(path) as f:
        reader = csv.reader(f)
        next(reader)  # skip header

        lines = []
        for line in reader:
            smiles = line[0]

            if smiles in skip_smiles:
                continue

            lines.append(line)

            if len(lines) >= max_data_size:
                break

    data = MoleculeDataset([
        MoleculeDatapoint(
            line=line,
            args=args,
            features=features_data[i] if features_data is not None else None,
            use_compound_names=use_compound_names
        ) for i, line in tqdm(enumerate(lines), total=len(lines))
    ])

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        data = filter_invalid_smiles(data)

        if len(data) < original_data_len:
            debug(f'Warning: {original_data_len - len(data)} SMILES are invalid.')

    # Record the feature dimension on args. Guarded so that an empty dataset (e.g. every
    # SMILES was invalid) or a call without args does not raise here.
    if args is not None and len(data) > 0 and data.data[0].features is not None:
        args.features_dim = len(data.data[0].features)

    return data
def get_data(path: str,
             smiles_column: str = None,
             target_columns: List[str] = None,
             skip_invalid_smiles: bool = True,
             args: Union[PredictArgs, TrainArgs] = None,
             features_path: List[str] = None,
             features_generator: List[str] = None,
             max_data_size: int = None,
             logger: Logger = None) -> MoleculeDataset:
    """
    Reads SMILES strings and target values from a CSV file into a MoleculeDataset.

    :param path: Path to a CSV file.
    :param smiles_column: The name of the column containing SMILES strings.
                          By default, uses the first column.
    :param target_columns: Name of the columns containing target values.
                           By default, uses all columns except the SMILES column.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles.
    :param args: Arguments. Supplies defaults for every option that is not passed explicitly.
    :param features_path: A list of paths to files containing features. If provided, it is used
                          in place of args.features_path.
    :param features_generator: A list of features generators to use. If provided, it is used
                               in place of args.features_generator.
    :param max_data_size: The maximum number of data points to load.
    :param logger: Logger. When omitted, messages go to :code:`print`.
    :return: A MoleculeDataset containing smiles strings and target values along
             with other info such as additional features when desired.
    """
    if logger is None:
        debug = print
    else:
        debug = logger.debug

    # Explicit function arguments win; anything left unset is taken from args
    if args is not None:
        if smiles_column is None:
            smiles_column = args.smiles_column
        if target_columns is None:
            target_columns = args.target_columns
        if features_path is None:
            features_path = args.features_path
        if features_generator is None:
            features_generator = args.features_generator
        if max_data_size is None:
            max_data_size = args.max_data_size

    if not max_data_size:
        max_data_size = float('inf')

    # Load features; each file is num_data x num_features
    features_data = None
    if features_path is not None:
        loaded = []
        for feat_path in features_path:
            loaded.append(load_features(feat_path))
        features_data = np.concatenate(loaded, axis=1)

    skip_smiles = set()

    # Load data
    all_smiles, all_targets, all_rows = [], [], []
    with open(path) as f:
        reader = csv.DictReader(f)
        columns = reader.fieldnames

        # The first column holds the SMILES unless told otherwise
        if smiles_column is None:
            smiles_column = columns[0]

        # Every remaining column is a target unless told otherwise
        if target_columns is None:
            target_columns = [name for name in columns if name != smiles_column]

        for row in reader:
            smiles = row[smiles_column]
            if smiles in skip_smiles:
                continue

            # Empty cells become missing targets
            targets = [None if row[name] == '' else float(row[name]) for name in target_columns]

            all_smiles.append(smiles)
            all_targets.append(targets)
            all_rows.append(row)

            if len(all_smiles) >= max_data_size:
                break

    datapoints = []
    triples = zip(all_smiles, all_targets, all_rows)
    for i, (smiles, targets, row) in tqdm(enumerate(triples), total=len(all_smiles)):
        if features_data is not None:
            features = features_data[i]
        else:
            features = None

        datapoints.append(MoleculeDatapoint(
            smiles=smiles,
            targets=targets,
            row=row,
            features_generator=features_generator,
            features=features
        ))
    data = MoleculeDataset(datapoints)

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        data = filter_invalid_smiles(data)

        num_invalid = original_data_len - len(data)
        if len(data) < original_data_len:
            debug(f'Warning: {num_invalid} SMILES are invalid.')

    return data
def get_data(path: str,
             smiles_columns: Union[str, List[str]] = None,
             target_columns: List[str] = None,
             ignore_columns: List[str] = None,
             skip_invalid_smiles: bool = True,
             args: Union[TrainArgs, PredictArgs] = None,
             data_weights_path: str = None,
             features_path: List[str] = None,
             features_generator: List[str] = None,
             phase_features_path: str = None,
             atom_descriptors_path: str = None,
             bond_features_path: str = None,
             max_data_size: int = None,
             store_row: bool = False,
             logger: Logger = None,
             loss_function: str = None,
             skip_none_targets: bool = False) -> MoleculeDataset:
    """
    Reads SMILES, targets and optional per-molecule side data from a CSV file.

    :param path: Path to a CSV file.
    :param smiles_columns: The names of the columns containing SMILES.
                           By default, uses the first :code:`number_of_molecules` columns.
    :param target_columns: Name of the columns containing target values. By default, uses all columns
                           except the :code:`smiles_column` and the :code:`ignore_columns`.
    :param ignore_columns: Name of the columns to ignore when :code:`target_columns` is not provided.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles using :func:`filter_invalid_smiles`.
    :param args: Arguments, either :class:`~chemprop.args.TrainArgs` or :class:`~chemprop.args.PredictArgs`.
    :param data_weights_path: A path to a file containing weights for each molecule in the loss function.
    :param features_path: A list of paths to files containing features. If provided, it is used
                          in place of :code:`args.features_path`.
    :param features_generator: A list of features generators to use. If provided, it is used
                               in place of :code:`args.features_generator`.
    :param phase_features_path: A path to a file containing phase features as applicable to spectra.
                                Every row must be one-hot encoded.
    :param atom_descriptors_path: The path to the file containing the custom atom descriptors.
    :param bond_features_path: The path to the file containing the custom bond features.
    :param max_data_size: The maximum number of data points to load.
    :param store_row: Whether to store the raw CSV row in each :class:`~chemprop.data.data.MoleculeDatapoint`.
    :param logger: A logger for recording output.
    :param loss_function: The loss function to be used in training. Inequality targets (> or <)
                          are only accepted with :code:`bounded_mse`.
    :param skip_none_targets: Whether to skip targets that are all 'None'. This is mostly relevant when
                              --target_columns are passed in, so only a subset of tasks are examined.
    :return: A :class:`~chemprop.data.MoleculeDataset` containing SMILES and target values along
             with other info such as additional features when desired.
    :raises ValueError: If phase features are not one-hot, if an inequality target is found without
                        :code:`bounded_mse`, or if custom atom/bond features fail to load.
    """
    debug = print if logger is None else logger.debug

    # Explicit function arguments win; anything left unset is taken from args
    if args is not None:
        if smiles_columns is None:
            smiles_columns = args.smiles_columns
        if target_columns is None:
            target_columns = args.target_columns
        if ignore_columns is None:
            ignore_columns = args.ignore_columns
        if features_path is None:
            features_path = args.features_path
        if features_generator is None:
            features_generator = args.features_generator
        if phase_features_path is None:
            phase_features_path = args.phase_features_path
        if atom_descriptors_path is None:
            atom_descriptors_path = args.atom_descriptors_path
        if bond_features_path is None:
            bond_features_path = args.bond_features_path
        if max_data_size is None:
            max_data_size = args.max_data_size
        if loss_function is None:
            loss_function = args.loss_function

    if not isinstance(smiles_columns, list):
        smiles_columns = preprocess_smiles_columns(path=path, smiles_columns=smiles_columns)

    if not max_data_size:
        max_data_size = float('inf')

    # Load features; each file is num_data x num_features
    features_data = None
    if features_path is not None:
        loaded = []
        for feat_path in features_path:
            loaded.append(load_features(feat_path))
        features_data = np.concatenate(loaded, axis=1)

    # Load phase features and append them to the molecular features
    phase_features = None
    if phase_features_path is not None:
        phase_features = load_features(phase_features_path)
        for d_phase in phase_features:
            if d_phase.sum() != 1 or np.count_nonzero(d_phase) != 1:
                raise ValueError('Phase features must be one-hot encoded.')

        if features_data is None:
            # Without other molecular features, phase features become the only molecular features
            features_data = np.array(phase_features)
        else:
            features_data = np.concatenate((features_data, phase_features), axis=1)

    # Load data weights
    data_weights = None
    if data_weights_path is not None:
        data_weights = get_data_weights(data_weights_path)

    # By default, the targets columns are all the columns except the SMILES column
    if target_columns is None:
        target_columns = get_task_names(
            path=path,
            smiles_columns=smiles_columns,
            target_columns=target_columns,
            ignore_columns=ignore_columns,
        )

    # Find targets provided as inequalities
    gt_targets, lt_targets = None, None
    if loss_function == 'bounded_mse':
        gt_targets, lt_targets = get_inequality_targets(path=path, target_columns=target_columns)

    # Load data
    all_smiles, all_targets, all_rows, all_features = [], [], [], []
    all_phase_features, all_weights, all_gt, all_lt = [], [], [], []

    with open(path) as f:
        reader = csv.DictReader(f)

        for i, row in enumerate(tqdm(reader)):
            smiles = [row[c] for c in smiles_columns]

            targets = []
            for column in target_columns:
                value = row[column]

                if value in ('', 'nan'):
                    targets.append(None)
                    continue

                is_inequality = '>' in value or '<' in value
                if is_inequality and loss_function != 'bounded_mse':
                    raise ValueError('Inequality found in target data. To use inequality targets (> or <), the regression loss function bounded_mse must be used.')

                targets.append(float(value.strip('<>') if is_inequality else value))

            # Check whether all targets are None and skip if so
            if skip_none_targets and all(x is None for x in targets):
                continue

            all_smiles.append(smiles)
            all_targets.append(targets)

            # Side data is selected with the raw CSV row index i
            if features_data is not None:
                all_features.append(features_data[i])
            if phase_features is not None:
                all_phase_features.append(phase_features[i])
            if data_weights is not None:
                all_weights.append(data_weights[i])
            if gt_targets is not None:
                all_gt.append(gt_targets[i])
            if lt_targets is not None:
                all_lt.append(lt_targets[i])

            if store_row:
                all_rows.append(row)

            if len(all_smiles) >= max_data_size:
                break

    # Custom atom features/descriptors are loaded for the first molecule of each kept row
    atom_features = None
    atom_descriptors = None
    if args is not None and args.atom_descriptors is not None:
        try:
            descriptors = load_valid_atom_or_bond_features(atom_descriptors_path, [x[0] for x in all_smiles])
        except Exception as e:
            raise ValueError(f'Failed to load or validate custom atomic descriptors or features: {e}')

        if args.atom_descriptors == 'feature':
            atom_features = descriptors
        elif args.atom_descriptors == 'descriptor':
            atom_descriptors = descriptors

    bond_features = None
    if args is not None and args.bond_features_path is not None:
        try:
            bond_features = load_valid_atom_or_bond_features(bond_features_path, [x[0] for x in all_smiles])
        except Exception as e:
            raise ValueError(f'Failed to load or validate custom bond features: {e}')

    datapoints = []
    for i, (smiles, targets) in tqdm(enumerate(zip(all_smiles, all_targets)), total=len(all_smiles)):
        datapoints.append(MoleculeDatapoint(
            smiles=smiles,
            targets=targets,
            row=all_rows[i] if store_row else None,
            data_weight=None if data_weights is None else all_weights[i],
            gt_targets=None if gt_targets is None else all_gt[i],
            lt_targets=None if lt_targets is None else all_lt[i],
            features_generator=features_generator,
            features=None if features_data is None else all_features[i],
            phase_features=None if phase_features is None else all_phase_features[i],
            atom_features=None if atom_features is None else atom_features[i],
            atom_descriptors=None if atom_descriptors is None else atom_descriptors[i],
            bond_features=None if bond_features is None else bond_features[i],
            overwrite_default_atom_features=args.overwrite_default_atom_features if args is not None else False,
            overwrite_default_bond_features=args.overwrite_default_bond_features if args is not None else False
        ))
    data = MoleculeDataset(datapoints)

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        data = filter_invalid_smiles(data)

        if len(data) < original_data_len:
            debug(f'Warning: {original_data_len - len(data)} SMILES are invalid.')

    return data
def get_data(path: str,
             skip_invalid_smiles: bool = True,
             args: Namespace = None,
             features_path: List[str] = None,
             max_data_size: int = None,
             use_compound_names: bool = False,
             logger: Logger = None) -> MoleculeDataset:
    """
    Gets smiles string and target values (and optionally compound names if provided) from a CSV file.

    :param path: Path to a CSV file. The first line is treated as a header and skipped.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles.
    :param args: Arguments. Supplies :code:`skip_smiles_path`, a fallback :code:`features_path` and an
                 upper bound on :code:`max_data_size`. When given, :code:`args.features_dim` is set
                 if the dataset is non-empty and has features.
    :param features_path: A list of paths to files containing features. If provided, it is used
                          in place of args.features_path.
    :param max_data_size: The maximum number of data points to load. The smaller of this value and
                          :code:`args.max_data_size` applies.
    :param use_compound_names: Whether file has compound names in addition to smiles strings.
    :param logger: Logger. Falls back to :code:`print` when not given.
    :return: A MoleculeDataset containing smiles strings and target values along
             with other info such as additional features and compound names when desired.
    """
    debug = logger.debug if logger is not None else print

    if args is not None:
        max_data_size = min(args.max_data_size or float('inf'), max_data_size or float('inf'))
        skip_smiles_path = args.skip_smiles_path
        features_path = features_path or args.features_path
    else:
        skip_smiles_path = None
        max_data_size = max_data_size or float('inf')

    # Load features
    if features_path is not None:
        # Each is num_data x num_features
        features_data = np.concatenate(
            [load_features(feat_path) for feat_path in features_path], axis=1)
    else:
        features_data = None

    # Load smiles to skip
    if skip_smiles_path is not None:
        with open(skip_smiles_path) as f:
            reader = csv.reader(f)
            next(reader)  # skip header
            skip_smiles = {line[0] for line in reader}
    else:
        skip_smiles = set()

    # Load data
    lines, line_features = [], []
    with open(path) as f:
        reader = csv.reader(f)
        next(reader)  # skip header

        for row_idx, line in enumerate(reader):
            smiles = line[0]

            if smiles in skip_smiles:
                continue

            lines.append(line)

            # Features are picked by the raw CSV row index, so rows dropped through
            # skip_smiles do not shift the features of the rows that follow them.
            if features_data is not None:
                line_features.append(features_data[row_idx])

            if len(lines) >= max_data_size:
                break

    data = MoleculeDataset([
        MoleculeDatapoint(
            line=line,
            args=args,
            features=line_features[i] if features_data is not None else None,
            use_compound_names=use_compound_names
        ) for i, line in tqdm(enumerate(lines), total=len(lines))
    ])

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        data = filter_invalid_smiles(data)

        if len(data) < original_data_len:
            debug(f'Warning: {original_data_len - len(data)} SMILES are invalid.')

    # Record the feature dimension on args. Guarded so that an empty dataset or a call
    # without args does not raise here.
    if args is not None and len(data) > 0 and data.data[0].features is not None:
        args.features_dim = len(data.data[0].features)

    if args is not None and args.dataset_type == 'regression_with_binning':
        data = convert_to_classes(data, args.num_bins)

    return data
def get_data(path: str,
             skip_invalid_smiles: bool = True,
             args: Namespace = None,
             features_path: List[str] = None,
             max_data_size: int = None,
             use_compound_names: bool = None,
             logger: Logger = None) -> MolPairDataset:
    """
    Gets pairs of smiles strings and their target values from a CSV file.

    The first two columns of each line are read as the drug and compound SMILES. Every later
    column is routed by :code:`args.data_format` at the same column position: 'P' is a target,
    '1' a drug feature, '2' a compound feature and anything else a context value. Without
    :code:`args` or without a data format, all later columns are targets.

    :param path: Path to a CSV file. The first line is treated as a header and skipped;
                 lines are split on commas without any quote handling.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles.
    :param args: Arguments. When given and :code:`args.use_input_features` is set,
                 :code:`args.features_dim` is updated from the loaded dataset.
    :param features_path: A list of paths to files containing features. If provided, it is used
                          in place of args.features_path. Only the last file in the list is retained.
    :param max_data_size: The maximum number of data points to load.
    :param use_compound_names: Whether file has compound names in addition to smiles strings.
                               The value is resolved but not used by this function.
    :param logger: Logger. Falls back to :code:`print` when not given.
    :return: A MolPairDataset containing smiles strings and target values along
             with other info such as additional features when desired.
    """
    debug = logger.debug if logger is not None else print

    if args is not None:
        # Prefer explicit function arguments but default to args if not provided
        features_path = features_path if features_path is not None else args.features_path
        max_data_size = max_data_size if max_data_size is not None else args.max_data_size
        use_compound_names = use_compound_names if use_compound_names is not None else args.use_compound_names
        data_format = args.data_format
    else:
        use_compound_names = False
        # Without args there is no column layout, so every value column is a target
        data_format = None

    max_data_size = max_data_size or float('inf')

    # Load features
    if features_path is not None:
        for feat_path in features_path:
            debug(f'Loading features from {feat_path}')
            # Expect file to look like {SMILES: feats}.
            # Each iteration overwrites the previous result, so the last file wins.
            features_data = load_features(feat_path)
    else:
        features_data = None

    # No SMILES are excluded in this version; the set is kept as an (always empty) hook
    skip_smiles = set()

    # Load data
    lines = []  # drug_smiles, cmpd_smiles, targets, feats1, feats2, context
    with open(path) as f:
        f.readline()  # skip header

        for raw_line in f:
            line = raw_line.strip().split(',')
            drug_smiles = line[0]
            cmpd_smiles = line[1]
            targets, context, feats1, feats2 = [], [], [], []

            if drug_smiles in skip_smiles or cmpd_smiles in skip_smiles:
                continue

            for i in range(2, len(line)):
                if data_format is None or data_format[i] == 'P':
                    targets.append(float(line[i]) if line[i] != '' else None)
                elif data_format[i] == '1':
                    feats1.append(float(line[i]) if line[i] != '' else np.nan)
                elif data_format[i] == '2':
                    feats2.append(float(line[i]) if line[i] != '' else np.nan)
                else:
                    context.append(float(line[i]) if line[i] != '' else np.nan)

            lines.append((drug_smiles, cmpd_smiles, targets, feats1, feats2, context))

            if len(lines) >= max_data_size:
                break

    data = MolPairDataset([
        MolPairDatapoint(
            drug_smiles=drug_smiles,
            cmpd_smiles=cmpd_smiles,
            targets=targets,
            args=args,
            drug_feats=features_helper(drug_smiles, features_data, feats1),
            cmpd_feats=features_helper(cmpd_smiles, features_data, feats2),
            context=np.array(context) if len(context) > 0 else None
        ) for drug_smiles, cmpd_smiles, targets, feats1, feats2, context in tqdm(lines, total=len(lines))
    ])

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        data = filter_invalid_smiles(data)

        if len(data) < original_data_len:
            debug(f'Warning: {original_data_len - len(data)} SMILES are invalid.')

    # Guarded so that a call without args does not fail after the data has been loaded
    if args is not None and args.use_input_features:
        args.features_dim = data.features_size()

    return data
def get_data(
    path: str,
    smiles_columns: Union[str, List[str]] = None,
    target_columns: List[str] = None,
    ignore_columns: List[str] = None,
    skip_invalid_smiles: bool = True,
    args: Union[TrainArgs, PredictArgs] = None,
    features_path: List[str] = None,
    features_generator: List[str] = None,
    atom_descriptors_path: str = None,
    bond_features_path: str = None,
    max_data_size: int = None,
    store_row: bool = False,
    logger: Logger = None,
    skip_none_targets: bool = False,
) -> MoleculeDataset:
    """
    Gets SMILES and target values from a CSV file.

    :param path: Path to a CSV file.
    :param smiles_columns: The names of the columns containing SMILES.
                           By default, uses the first :code:`number_of_molecules` columns.
    :param target_columns: Name of the columns containing target values. By default, uses all columns
                           except the :code:`smiles_column` and the :code:`ignore_columns`.
    :param ignore_columns: Name of the columns to ignore when :code:`target_columns` is not provided.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles using :func:`filter_invalid_smiles`.
    :param args: Arguments, either :class:`~chemprop.args.TrainArgs` or :class:`~chemprop.args.PredictArgs`.
    :param features_path: A list of paths to files containing features. If provided, it is used
                          in place of :code:`args.features_path`.
    :param features_generator: A list of features generators to use. If provided, it is used
                               in place of :code:`args.features_generator`.
    :param atom_descriptors_path: The path to the file containing the custom atom descriptors.
                                  Only read when :code:`args.atom_descriptors` is set, because that
                                  value selects whether the file holds features or descriptors.
    :param bond_features_path: The path to the file containing the custom bond features. If provided,
                               it is used in place of :code:`args.bond_features_path`, including when
                               :code:`args` is None.
    :param max_data_size: The maximum number of data points to load.
    :param logger: A logger for recording output. Falls back to :code:`print` when None.
    :param store_row: Whether to store the raw CSV row in each :class:`~chemprop.data.data.MoleculeDatapoint`.
    :param skip_none_targets: Whether to skip targets that are all 'None'. This is mostly relevant when
                              --target_columns are passed in, so only a subset of tasks are examined.
    :return: A :class:`~chemprop.data.MoleculeDataset` containing SMILES and target values along
             with other info such as additional features when desired.
    :raises ValueError: If the custom atom descriptors/features or bond features cannot be loaded
                        or validated.
    """
    debug = logger.debug if logger is not None else print

    if args is not None:
        # Prefer explicit function arguments but default to args if not provided
        smiles_columns = smiles_columns if smiles_columns is not None else args.smiles_columns
        target_columns = target_columns if target_columns is not None else args.target_columns
        ignore_columns = ignore_columns if ignore_columns is not None else args.ignore_columns
        features_path = features_path if features_path is not None else args.features_path
        features_generator = (features_generator if features_generator is not None
                              else args.features_generator)
        atom_descriptors_path = (atom_descriptors_path if atom_descriptors_path is not None
                                 else args.atom_descriptors_path)
        bond_features_path = (bond_features_path if bond_features_path is not None
                              else args.bond_features_path)
        max_data_size = max_data_size if max_data_size is not None else args.max_data_size

    if not isinstance(smiles_columns, list):
        smiles_columns = preprocess_smiles_columns(path=path, smiles_columns=smiles_columns)

    max_data_size = max_data_size or float("inf")

    # Load features
    if features_path is not None:
        # each loaded array is num_data x num_features; stack them side by side
        features_data = np.concatenate(
            [load_features(feat_path) for feat_path in features_path], axis=1)
    else:
        features_data = None

    # Load data
    with open(path) as f:
        reader = csv.DictReader(f)

        # By default, the targets columns are all the columns except the SMILES column
        if target_columns is None:
            target_columns = get_task_names(
                path=path,
                smiles_columns=smiles_columns,
                target_columns=target_columns,
                ignore_columns=ignore_columns,
            )

        all_smiles, all_targets, all_rows, all_features = [], [], [], []
        for i, row in enumerate(tqdm(reader)):
            smiles = [row[c] for c in smiles_columns]
            targets = [
                float(row[column]) if row[column] != "" else None
                for column in target_columns
            ]

            # Check whether all targets are None and skip if so
            if skip_none_targets and all(x is None for x in targets):
                continue

            all_smiles.append(smiles)
            all_targets.append(targets)

            # i is the row's position in the file, so features stay aligned when rows are skipped
            if features_data is not None:
                all_features.append(features_data[i])

            if store_row:
                all_rows.append(row)

            if len(all_smiles) >= max_data_size:
                break

    # The custom atom/bond files are validated against the first SMILES of every datapoint
    first_smiles = [x[0] for x in all_smiles]

    atom_features = None
    atom_descriptors = None
    if args is not None and args.atom_descriptors is not None:
        try:
            descriptors = load_valid_atom_or_bond_features(atom_descriptors_path, first_smiles)
        except Exception as e:
            raise ValueError(
                f"Failed to load or validate custom atomic descriptors or features: {e}"
            ) from e

        if args.atom_descriptors == "feature":
            atom_features = descriptors
        elif args.atom_descriptors == "descriptor":
            atom_descriptors = descriptors

    bond_features = None
    # Gate on the resolved path so an explicitly passed bond_features_path is honoured
    # even when args is None or args.bond_features_path is unset.
    if bond_features_path is not None:
        try:
            bond_features = load_valid_atom_or_bond_features(bond_features_path, first_smiles)
        except Exception as e:
            raise ValueError(
                f"Failed to load or validate custom bond features: {e}") from e

    overwrite_atom_features = args.overwrite_default_atom_features if args is not None else False
    overwrite_bond_features = args.overwrite_default_bond_features if args is not None else False

    data = MoleculeDataset([
        MoleculeDatapoint(
            smiles=smiles,
            targets=targets,
            row=all_rows[i] if store_row else None,
            features_generator=features_generator,
            features=all_features[i] if features_data is not None else None,
            atom_features=atom_features[i] if atom_features is not None else None,
            atom_descriptors=atom_descriptors[i] if atom_descriptors is not None else None,
            bond_features=bond_features[i] if bond_features is not None else None,
            overwrite_default_atom_features=overwrite_atom_features,
            overwrite_default_bond_features=overwrite_bond_features,
        )
        for i, (smiles, targets) in tqdm(
            enumerate(zip(all_smiles, all_targets)), total=len(all_smiles))
    ])

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        data = filter_invalid_smiles(data)

        if len(data) < original_data_len:
            debug(
                f"Warning: {original_data_len - len(data)} SMILES are invalid."
            )

    return data
def get_data(path: str,
             args: Namespace = None,
             max_data_size: int = None,
             use_compound_names: bool = False) -> MoleculeDataset:
    """
    Gets smiles string and target values (and optionally compound names if provided) from a CSV file.

    The first line of the file is treated as a header and skipped. Rows whose first column appears
    in the first column of ``args.skip_smiles_path`` are dropped. When ``args.features_path`` is set,
    the loaded feature arrays are concatenated column-wise and row ``k`` of the result is attached
    to data row ``k`` of the file (counting from the first row after the header).

    :param path: Path to a CSV file.
    :param args: Arguments. Supplies max_data_size, skip_smiles_path, features_path, dataset_type
                 and num_bins; ``features_dim`` is written back to it when features are present.
    :param max_data_size: The maximum number of data points to load. When args is given, the
                          smaller of this and args.max_data_size applies.
    :param use_compound_names: Whether file has compound names in addition to smiles strings.
    :return: A MoleculeDataset containing smiles strings and target values along
             with other info such as additional features and compound names when desired.
    """
    features_data = skip_smiles_path = None

    if args is not None:
        max_data_size = min(args.max_data_size or float('inf'), max_data_size or float('inf'))
        skip_smiles_path = args.skip_smiles_path

        if args.features_path:
            # each loaded array is num_data x num_features; stack them side by side
            features_data = np.concatenate(
                [load_features(features_path) for features_path in args.features_path], axis=1)

    max_data_size = max_data_size or float('inf')

    # Load smiles to skip
    if skip_smiles_path is not None:
        with open(skip_smiles_path) as f:
            f.readline()  # skip header
            skip_smiles = {line.split(',')[0] for line in f}
    else:
        skip_smiles = set()

    # Load data, remembering each kept line's position in the file so that features
    # remain aligned with their molecule even when earlier rows are skipped.
    kept = []  # (row index in file, raw line)
    with open(path) as f:
        f.readline()  # skip header

        for row_idx, line in enumerate(f):
            smiles = line.split(',')[0]

            if smiles == '':
                line = 'C' + line  # bandage for if your dataset has an empty line on rare occasions

            if smiles in skip_smiles:
                continue

            kept.append((row_idx, line))

            if len(kept) >= max_data_size:
                break

    data = MoleculeDataset([
        MoleculeDatapoint(
            line=line.strip().split(','),
            args=args,
            features=features_data[row_idx] if features_data is not None else None,
            use_compound_names=use_compound_names,
        )
        for row_idx, line in tqdm(kept, total=len(kept))
    ])

    # Only inspect the first datapoint when one exists and there is an args object to update
    if args is not None and kept and data.data[0].features is not None:
        args.features_dim = len(data.data[0].features)

    if args is not None and args.dataset_type == 'regression_with_binning':
        data = convert_to_classes(data, args.num_bins)

    return data