Example #1
def _get_data_from_df(data_df: pd.DataFrame,
                      skip_invalid_smiles: bool = True,
                      features_path: List[str] = None,
                      features_generator: List[str] = None,
                      max_data_size: int = None) -> MoleculeDataset:
    '''
    Gets SMILES strings and target values (and optionally compound names if
    provided) from a parsed CSV file.

    :param data_df: DataFrame from the parsed CSV file.
    :param skip_invalid_smiles: Whether to skip and filter out invalid SMILES.
    :param features_path: A list of paths to files containing features.
    :param features_generator: A list of features generator names.
    :param max_data_size: The maximum number of data points to load.
    :return: A MoleculeDataset containing SMILES strings and target values
    along with other info such as additional features and compound names when
    desired.
    '''
    max_data_size = max_data_size or float('inf')

    # Load features:
    if features_path:
        features_data = []

        for feat_path in features_path:
            # Each is num_data x num_features:
            features_data.append(load_features(feat_path))

        features_data = np.concatenate(features_data, axis=1)
    else:
        features_data = None

    # Load data
    data = []

    for idx, (smiles, targets) in \
            enumerate(data_df.head(min(max_data_size,
                                       len(data_df))).iterrows()):

        # MolFromSmiles returns None for unparsable SMILES; mol must also be
        # defined when the SMILES string is empty
        mol = Chem.MolFromSmiles(smiles) if smiles else None

        if not skip_invalid_smiles or (mol is not None
                                       and mol.GetNumHeavyAtoms() > 0):
            data.append(MoleculeDatapoint(
                smiles=smiles,
                mol=mol,
                targets=targets,
                features=features_data[idx] if features_data is not None else None,
                features_generator=features_generator,
                compound_name=None
            ))

    return MoleculeDataset(data)
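
A minimal usage sketch (the data is illustrative, and the DataFrame is assumed
to be indexed by SMILES, as the iterrows() unpacking above implies):

import pandas as pd

# Hypothetical data: the index holds SMILES, the columns hold targets.
df = pd.DataFrame({'activity': [1.0, 0.0]}, index=['CCO', 'not-a-smiles'])

dataset = _get_data_from_df(df)  # the unparsable row is filtered out
print(len(dataset))  # 1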
Example #2
    def test_predict_spectra(self,
                             name: str,
                             model_type: str,
                             expected_score: float,
                             expected_nans: int,
                             train_flags: List[str] = None,
                             predict_flags: List[str] = None):
        with TemporaryDirectory() as save_dir:
            # Train
            dataset_type = 'spectra'
            self.train(dataset_type=dataset_type,
                       metric='sid',
                       save_dir=save_dir,
                       model_type=model_type,
                       flags=train_flags)

            # Predict
            preds_path = os.path.join(save_dir, 'preds.csv')
            self.predict(dataset_type=dataset_type,
                         preds_path=preds_path,
                         save_dir=save_dir,
                         model_type=model_type,
                         flags=predict_flags)

            # Check results
            pred = pd.read_csv(preds_path)
            true = pd.read_csv(os.path.join(TEST_DATA_DIR, 'spectra.csv'))
            self.assertEqual(list(pred.keys()), list(true.keys()))
            self.assertEqual(list(pred['smiles']), list(true['smiles']))

            pred, true = pred.drop(columns=['smiles']), true.drop(
                columns=['smiles'])
            pred, true = pred.to_numpy(), true.to_numpy()
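            # Positional assumption: predict_flags[1] holds the phase
            # features path and, when present, train_flags[5] holds the
            # phase mask path.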
            phase_features = load_features(predict_flags[1])
            if '--spectra_phase_mask_path' in train_flags:
                mask = load_phase_mask(train_flags[5])
            else:
                mask = None
            true = normalize_spectra(true, phase_features, mask)
            sid = evaluate_predictions(preds=pred,
                                       targets=true,
                                       num_tasks=len(true[0]),
                                       metrics=['sid'],
                                       dataset_type='spectra')['sid'][0]
            self.assertAlmostEqual(sid,
                                   expected_score,
                                   delta=DELTA * expected_score)
            self.assertEqual(np.sum(np.isnan(pred)), expected_nans)
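
A hedged sketch of flag lists consistent with the positional indexing above
(the exact flag names and file names are assumptions):

train_flags = ['--features_path', 'spectra_phases.csv',
               '--features_only', '--no_features_scaling',
               '--spectra_phase_mask_path', 'spectra_mask.csv']
predict_flags = ['--features_path', 'spectra_phases.csv']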
Example #3
def load_temp(temp_dir: str) -> Tuple[List[List[float]], int]:
    """
    Loads all features saved as .npz files in temp_dir.

    Assumes temporary files are named in order 0.npz, 1.npz, ...

    :param temp_dir: Directory in which temporary .npz files containing features are stored.
    :return: A tuple with a list of molecule features, where each molecule's features is a list of floats,
    and the number of temporary files.
    """
    features = []
    temp_num = 0
    temp_path = os.path.join(temp_dir, f'{temp_num}.npz')

    while os.path.exists(temp_path):
        features.extend(load_features(temp_path))
        temp_num += 1
        temp_path = os.path.join(temp_dir, f'{temp_num}.npz')

    return features, temp_num
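
A hypothetical round trip, assuming load_features reads the 'features' array
out of each .npz file (the key name is an assumption):

import os
import numpy as np

temp_dir = 'temp_features'
os.makedirs(temp_dir, exist_ok=True)
for i, chunk in enumerate([np.zeros((2, 4)), np.ones((3, 4))]):
    np.savez_compressed(os.path.join(temp_dir, f'{i}.npz'), features=chunk)

features, num_files = load_temp(temp_dir)
print(len(features), num_files)  # 5 2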
Example #4
def get_data(path: str,
             smiles_column: str = None,
             target_columns: List[str] = None,
             ignore_columns: List[str] = None,
             skip_invalid_smiles: bool = True,
             args: Union[TrainArgs, PredictArgs] = None,
             features_path: List[str] = None,
             features_generator: List[str] = None,
             atom_descriptors_path: str = None,
             max_data_size: int = None,
             store_row: bool = False,
             logger: Logger = None,
             skip_none_targets: bool = False) -> MoleculeDataset:
    """
    Gets SMILES and target values from a CSV file.

    :param path: Path to a CSV file.
    :param smiles_column: The name of the column containing SMILES. By default, uses the first column.
    :param target_columns: Name of the columns containing target values. By default, uses all columns
                           except the :code:`smiles_column` and the :code:`ignore_columns`.
    :param ignore_columns: Name of the columns to ignore when :code:`target_columns` is not provided.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles using :func:`filter_invalid_smiles`.
    :param args: Arguments, either :class:`~chemprop.args.TrainArgs` or :class:`~chemprop.args.PredictArgs`.
    :param features_path: A list of paths to files containing features. If provided, it is used
                          in place of :code:`args.features_path`.
    :param features_generator: A list of features generators to use. If provided, it is used
                               in place of :code:`args.features_generator`.
    :param atom_descriptors_path: The path to the file containing the custom atom descriptors.
    :param max_data_size: The maximum number of data points to load.
    :param logger: A logger for recording output.
    :param store_row: Whether to store the raw CSV row in each :class:`~chemprop.data.data.MoleculeDatapoint`.
    :param skip_none_targets: Whether to skip targets that are all 'None'. This is mostly relevant when --target_columns
                              are passed in, so only a subset of tasks are examined.
    :return: A :class:`~chemprop.data.MoleculeDataset` containing SMILES and target values along
             with other info such as additional features when desired.
    """
    debug = logger.debug if logger is not None else print

    # Load atomic descriptors
    atom_features = None
    atom_descriptors = None

    if args is not None:
        # Prefer explicit function arguments but default to args if not provided
        smiles_column = smiles_column if smiles_column is not None else args.smiles_column
        target_columns = target_columns if target_columns is not None else args.target_columns
        ignore_columns = ignore_columns if ignore_columns is not None else args.ignore_columns
        features_path = features_path if features_path is not None else args.features_path
        features_generator = features_generator if features_generator is not None else args.features_generator
        atom_descriptors_path = atom_descriptors_path if atom_descriptors_path is not None \
            else args.atom_descriptors_path
        max_data_size = max_data_size if max_data_size is not None else args.max_data_size

        if args.atom_descriptors == 'feature':
            atom_features = load_atom_features(atom_descriptors_path)
        elif args.atom_descriptors == 'descriptor':
            atom_descriptors = load_atom_features(atom_descriptors_path)

    max_data_size = max_data_size or float('inf')

    # Load features
    if features_path is not None:
        features_data = []
        for feat_path in features_path:
            features_data.append(
                load_features(feat_path))  # each is num_data x num_features
        features_data = np.concatenate(features_data, axis=1)
    else:
        features_data = None

    skip_smiles = set()

    # Load data
    with open(path) as f:
        reader = csv.DictReader(f)
        columns = reader.fieldnames

        # By default, the SMILES column is the first column
        if smiles_column is None:
            smiles_column = columns[0]

        # By default, the targets columns are all the columns except the SMILES column
        if target_columns is None:
            ignore_columns = set([smiles_column] + (
                [] if ignore_columns is None else ignore_columns))
            target_columns = [
                column for column in columns if column not in ignore_columns
            ]

        all_smiles, all_targets, all_rows, all_features = [], [], [], []
        for i, row in tqdm(enumerate(reader)):
            smiles = row[smiles_column]

            if smiles in skip_smiles:
                continue

            targets = [
                float(row[column]) if row[column] != '' else None
                for column in target_columns
            ]

            # Check whether all targets are None and skip if so
            if skip_none_targets and all(x is None for x in targets):
                continue

            all_smiles.append(smiles)
            all_targets.append(targets)

            if features_data is not None:
                all_features.append(features_data[i])

            if store_row:
                all_rows.append(row)

            if len(all_smiles) >= max_data_size:
                break

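        # Caveat: atom_features/atom_descriptors were loaded for the whole
        # file, so rows skipped above via skip_none_targets would misalign
        # the per-index lookups below.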
        data = MoleculeDataset([
            MoleculeDatapoint(
                smiles=smiles,
                targets=targets,
                row=all_rows[i] if store_row else None,
                features_generator=features_generator,
                features=all_features[i]
                if features_data is not None else None,
                atom_features=atom_features[i]
                if atom_features is not None else None,
                atom_descriptors=atom_descriptors[i]
                if atom_descriptors is not None else None,
            ) for i, (smiles, targets) in tqdm(
                enumerate(zip(all_smiles, all_targets)), total=len(all_smiles))
        ])

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        data = filter_invalid_smiles(data)

        if len(data) < original_data_len:
            debug(
                f'Warning: {original_data_len - len(data)} SMILES are invalid.'
            )

    return data
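
A hypothetical call with explicit column names ('data.csv' and the column
names are illustrative):

dataset = get_data(path='data.csv',
                   smiles_column='smiles',
                   target_columns=['logP', 'solubility'],
                   max_data_size=10000)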
Example #5
def get_data(path: str,
             skip_invalid_smiles: bool = True,
             args: Namespace = None,
             features_path: List[str] = None,
             max_data_size: int = None,
             use_compound_names: bool = None,
             logger: Logger = None) -> MoleculeDataset:
    """
    Gets SMILES strings and target values (and optionally compound names if provided) from a CSV file.

    :param path: Path to a CSV file.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles.
    :param args: Arguments.
    :param features_path: A list of paths to files containing features. If provided, it is used
    in place of args.features_path.
    :param max_data_size: The maximum number of data points to load.
    :param use_compound_names: Whether file has compound names in addition to smiles strings.
    :param logger: Logger.
    :return: A MoleculeDataset containing smiles strings and target values along
    with other info such as additional features and compound names when desired.
    """
    debug = logger.debug if logger is not None else print

    if args is not None:
        # Prefer explicit function arguments but default to args if not provided
        features_path = features_path if features_path is not None else args.features_path
        max_data_size = max_data_size if max_data_size is not None else args.max_data_size
        use_compound_names = use_compound_names if use_compound_names is not None else args.use_compound_names
    else:
        use_compound_names = False

    max_data_size = max_data_size or float('inf')

    # Load features
    if features_path is not None:
        features_data = []
        for feat_path in features_path:
            # each is num_data x num_features
            features_data.append(load_features(feat_path))
        features_data = np.concatenate(features_data, axis=1)
    else:
        features_data = None

    skip_smiles = set()

    # Load data
    with open(path) as f:
        reader = csv.reader(f)
        next(reader)  # skip header

        lines = []
        for line in reader:
            smiles = line[0]

            if smiles in skip_smiles:
                continue

            lines.append(line)

            if len(lines) >= max_data_size:
                break
        middle = []
        for i, line in tqdm(enumerate(lines), total=len(lines)):
            temp = MoleculeDatapoint(line=line,
                                     args=args,
                                     features=features_data[i]
                                     if features_data is not None else None,
                                     use_compound_names=use_compound_names)
            middle.append(temp)
        data = MoleculeDataset(middle)
        '''
        data = MoleculeDataset([
            MoleculeDatapoint(
                line=line,
                args=args,
                features=features_data[i] if features_data is not None else None,
                use_compound_names=use_compound_names
            ) for i, line in tqdm(enumerate(lines), total=len(lines))
        ])
        '''

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        data = filter_invalid_smiles(data)

        if len(data) < original_data_len:
            debug(
                f'Warning: {original_data_len - len(data)} SMILES are invalid.'
            )

    if (args is not None and len(data) > 0
            and data.data[0].features is not None):
        args.features_dim = len(data.data[0].features)

    return data
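
A sketch of driving this variant through an argparse Namespace (only the
fields read above are set; a real run needs whatever MoleculeDatapoint
also reads from args):

from argparse import Namespace

args = Namespace(features_path=None, max_data_size=1000,
                 use_compound_names=True)
dataset = get_data('data.csv', args=args)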
Example #6
def get_data(path: str,
             smiles_column: str = None,
             target_columns: List[str] = None,
             skip_invalid_smiles: bool = True,
             args: Union[PredictArgs, TrainArgs] = None,
             features_path: List[str] = None,
             features_generator: List[str] = None,
             max_data_size: int = None,
             logger: Logger = None) -> MoleculeDataset:
    """
    Gets SMILES strings and target values (and optionally compound names if provided) from a CSV file.

    :param path: Path to a CSV file.
    :param smiles_column: The name of the column containing SMILES strings. By default, uses the first column.
    :param target_columns: Name of the columns containing target values. By default, uses all columns except the SMILES column.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles.
    :param args: Arguments.
    :param features_path: A list of paths to files containing features. If provided, it is used
    in place of args.features_path.
    :param features_generator: A list of features generators to use. If provided, it is used
    in place of args.features_generator.
    :param max_data_size: The maximum number of data points to load.
    :param logger: Logger.
    :return: A MoleculeDataset containing smiles strings and target values along
    with other info such as additional features and compound names when desired.
    """
    debug = logger.debug if logger is not None else print

    if args is not None:
        # Prefer explicit function arguments but default to args if not provided
        smiles_column = smiles_column if smiles_column is not None else args.smiles_column
        target_columns = target_columns if target_columns is not None else args.target_columns
        features_path = features_path if features_path is not None else args.features_path
        features_generator = features_generator if features_generator is not None else args.features_generator
        max_data_size = max_data_size if max_data_size is not None else args.max_data_size

    max_data_size = max_data_size or float('inf')

    # Load features
    if features_path is not None:
        features_data = []
        for feat_path in features_path:
            features_data.append(
                load_features(feat_path))  # each is num_data x num_features
        features_data = np.concatenate(features_data, axis=1)
    else:
        features_data = None

    skip_smiles = set()

    # Load data
    with open(path) as f:
        reader = csv.DictReader(f)
        columns = reader.fieldnames

        # By default, the SMILES column is the first column
        if smiles_column is None:
            smiles_column = columns[0]

        # By default, the targets columns are all the columns except the SMILES column
        if target_columns is None:
            target_columns = [
                column for column in columns if column != smiles_column
            ]

        all_smiles, all_targets, all_rows = [], [], []
        for row in reader:
            smiles = row[smiles_column]

            if smiles in skip_smiles:
                continue

            targets = [
                float(row[column]) if row[column] != '' else None
                for column in target_columns
            ]

            all_smiles.append(smiles)
            all_targets.append(targets)
            all_rows.append(row)

            if len(all_smiles) >= max_data_size:
                break

        data = MoleculeDataset([
            MoleculeDatapoint(smiles=smiles,
                              targets=targets,
                              row=row,
                              features_generator=features_generator,
                              features=features_data[i]
                              if features_data is not None else None)
            for i, (smiles, targets, row) in tqdm(
                enumerate(zip(all_smiles, all_targets, all_rows)),
                total=len(all_smiles))
        ])

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        data = filter_invalid_smiles(data)

        if len(data) < original_data_len:
            debug(
                f'Warning: {original_data_len - len(data)} SMILES are invalid.'
            )

    return data
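
A sketch of computing features on the fly instead of loading them from disk
('morgan' is assumed to be a registered features generator name, as in
chemprop):

dataset = get_data('data.csv', features_generator=['morgan'])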
Example #7
def get_data(path: str,
             smiles_columns: Union[str, List[str]] = None,
             target_columns: List[str] = None,
             ignore_columns: List[str] = None,
             skip_invalid_smiles: bool = True,
             args: Union[TrainArgs, PredictArgs] = None,
             data_weights_path: str = None,
             features_path: List[str] = None,
             features_generator: List[str] = None,
             phase_features_path: str = None,
             atom_descriptors_path: str = None,
             bond_features_path: str = None,
             max_data_size: int = None,
             store_row: bool = False,
             logger: Logger = None,
             loss_function: str = None,
             skip_none_targets: bool = False) -> MoleculeDataset:
    """
    Gets SMILES and target values from a CSV file.

    :param path: Path to a CSV file.
    :param smiles_columns: The names of the columns containing SMILES.
                           By default, uses the first :code:`number_of_molecules` columns.
    :param target_columns: Name of the columns containing target values. By default, uses all columns
                           except the :code:`smiles_column` and the :code:`ignore_columns`.
    :param ignore_columns: Name of the columns to ignore when :code:`target_columns` is not provided.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles using :func:`filter_invalid_smiles`.
    :param args: Arguments, either :class:`~chemprop.args.TrainArgs` or :class:`~chemprop.args.PredictArgs`.
    :param data_weights_path: A path to a file containing weights for each molecule in the loss function.
    :param features_path: A list of paths to files containing features. If provided, it is used
                          in place of :code:`args.features_path`.
    :param features_generator: A list of features generators to use. If provided, it is used
                               in place of :code:`args.features_generator`.
    :param phase_features_path: A path to a file containing phase features as applicable to spectra.
    :param atom_descriptors_path: The path to the file containing the custom atom descriptors.
    :param bond_features_path: The path to the file containing the custom bond features.
    :param max_data_size: The maximum number of data points to load.
    :param logger: A logger for recording output.
    :param store_row: Whether to store the raw CSV row in each :class:`~chemprop.data.data.MoleculeDatapoint`.
    :param skip_none_targets: Whether to skip targets that are all 'None'. This is mostly relevant when --target_columns
                              are passed in, so only a subset of tasks are examined.
    :param loss_function: The loss function to be used in training.
    :return: A :class:`~chemprop.data.MoleculeDataset` containing SMILES and target values along
             with other info such as additional features when desired.
    """
    debug = logger.debug if logger is not None else print

    if args is not None:
        # Prefer explicit function arguments but default to args if not provided
        smiles_columns = smiles_columns if smiles_columns is not None else args.smiles_columns
        target_columns = target_columns if target_columns is not None else args.target_columns
        ignore_columns = ignore_columns if ignore_columns is not None else args.ignore_columns
        features_path = features_path if features_path is not None else args.features_path
        features_generator = features_generator if features_generator is not None else args.features_generator
        phase_features_path = phase_features_path if phase_features_path is not None else args.phase_features_path
        atom_descriptors_path = atom_descriptors_path if atom_descriptors_path is not None \
            else args.atom_descriptors_path
        bond_features_path = bond_features_path if bond_features_path is not None \
            else args.bond_features_path
        max_data_size = max_data_size if max_data_size is not None else args.max_data_size
        loss_function = loss_function if loss_function is not None else args.loss_function

    if not isinstance(smiles_columns, list):
        smiles_columns = preprocess_smiles_columns(path=path, smiles_columns=smiles_columns)

    max_data_size = max_data_size or float('inf')

    # Load features
    if features_path is not None:
        features_data = []
        for feat_path in features_path:
            features_data.append(load_features(feat_path))  # each is num_data x num_features
        features_data = np.concatenate(features_data, axis=1)
    else:
        features_data = None
        
    if phase_features_path is not None:
        phase_features = load_features(phase_features_path)
        for d_phase in phase_features:
            if not (d_phase.sum() == 1 and np.count_nonzero(d_phase) == 1):
                raise ValueError('Phase features must be one-hot encoded.')
        if features_data is not None:
            features_data = np.concatenate((features_data, phase_features),
                                           axis=1)
        else: # if there are no other molecular features, phase features become the only molecular features
            features_data = np.array(phase_features)
    else:
        phase_features = None

    # Load data weights
    if data_weights_path is not None:
        data_weights = get_data_weights(data_weights_path)
    else:
        data_weights = None

    # By default, the targets columns are all the columns except the SMILES column
    if target_columns is None:
        target_columns = get_task_names(
            path=path,
            smiles_columns=smiles_columns,
            target_columns=target_columns,
            ignore_columns=ignore_columns,
        )

    # Find targets provided as inequalities
    if loss_function == 'bounded_mse':
        gt_targets, lt_targets = get_inequality_targets(path=path, target_columns=target_columns)
    else:
        gt_targets, lt_targets = None, None

    # Load data
    with open(path) as f:
        reader = csv.DictReader(f)

        all_smiles, all_targets, all_rows, all_features = [], [], [], []
        all_phase_features, all_weights, all_gt, all_lt = [], [], [], []
        for i, row in enumerate(tqdm(reader)):
            smiles = [row[c] for c in smiles_columns]

            targets = []
            for column in target_columns:
                value = row[column]
                if value in ['', 'nan']:
                    targets.append(None)
                elif '>' in value or '<' in value:
                    if loss_function == 'bounded_mse':
                        targets.append(float(value.strip('<>')))
                    else:
                        raise ValueError('Inequality found in target data. To use inequality targets (> or <), the regression loss function bounded_mse must be used.')
                else:
                    targets.append(float(value))

            # Check whether all targets are None and skip if so
            if skip_none_targets and all(x is None for x in targets):
                continue

            all_smiles.append(smiles)
            all_targets.append(targets)

            if features_data is not None:
                all_features.append(features_data[i])
            
            if phase_features is not None:
                all_phase_features.append(phase_features[i])

            if data_weights is not None:
                all_weights.append(data_weights[i])

            if gt_targets is not None:
                all_gt.append(gt_targets[i])

            if lt_targets is not None:
                all_lt.append(lt_targets[i])

            if store_row:
                all_rows.append(row)

            if len(all_smiles) >= max_data_size:
                break

        atom_features = None
        atom_descriptors = None
        if args is not None and args.atom_descriptors is not None:
            try:
                descriptors = load_valid_atom_or_bond_features(atom_descriptors_path, [x[0] for x in all_smiles])
            except Exception as e:
                raise ValueError(f'Failed to load or validate custom atomic descriptors or features: {e}')

            if args.atom_descriptors == 'feature':
                atom_features = descriptors
            elif args.atom_descriptors == 'descriptor':
                atom_descriptors = descriptors

        bond_features = None
        if args is not None and args.bond_features_path is not None:
            try:
                bond_features = load_valid_atom_or_bond_features(bond_features_path, [x[0] for x in all_smiles])
            except Exception as e:
                raise ValueError(f'Failed to load or validate custom bond features: {e}')

        data = MoleculeDataset([
            MoleculeDatapoint(
                smiles=smiles,
                targets=targets,
                row=all_rows[i] if store_row else None,
                data_weight=all_weights[i] if data_weights is not None else None,
                gt_targets=all_gt[i] if gt_targets is not None else None,
                lt_targets=all_lt[i] if lt_targets is not None else None,
                features_generator=features_generator,
                features=all_features[i] if features_data is not None else None,
                phase_features=all_phase_features[i] if phase_features is not None else None,
                atom_features=atom_features[i] if atom_features is not None else None,
                atom_descriptors=atom_descriptors[i] if atom_descriptors is not None else None,
                bond_features=bond_features[i] if bond_features is not None else None,
                overwrite_default_atom_features=args.overwrite_default_atom_features if args is not None else False,
                overwrite_default_bond_features=args.overwrite_default_bond_features if args is not None else False
            ) for i, (smiles, targets) in tqdm(enumerate(zip(all_smiles, all_targets)),
                                               total=len(all_smiles))
        ])

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        data = filter_invalid_smiles(data)

        if len(data) < original_data_len:
            debug(f'Warning: {original_data_len - len(data)} SMILES are invalid.')

    return data
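
A sketch of the inequality-target path (the file and its contents are
hypothetical):

# assay.csv:
#   smiles,activity
#   CCO,>5.0
#   CCN,3.2
dataset = get_data('assay.csv', loss_function='bounded_mse')
# '>5.0' is stored as the float 5.0, with gt_targets/lt_targets recording
# which side of the bound each value sits on.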
Example #8
def get_data(path: str,
             skip_invalid_smiles: bool = True,
             args: Namespace = None,
             features_path: List[str] = None,
             max_data_size: int = None,
             use_compound_names: bool = False,
             logger: Logger = None) -> MoleculeDataset:
    """
    Gets SMILES strings and target values (and optionally compound names if provided) from a CSV file.

    :param path: Path to a CSV file.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles.
    :param args: Arguments.
    :param features_path: A list of paths to .pckl files containing features. If provided, it is used
    in place of args.features_path.
    :param max_data_size: The maximum number of data points to load.
    :param use_compound_names: Whether file has compound names in addition to smiles strings.
    :param logger: Logger.
    :return: A MoleculeDataset containing smiles strings and target values along
    with other info such as additional features and compound names when desired.
    """
    debug = logger.debug if logger is not None else print

    if args is not None:
        max_data_size = min(args.max_data_size or float('inf'), max_data_size or float('inf'))
        skip_smiles_path = args.skip_smiles_path
        features_path = features_path or args.features_path
    else:
        skip_smiles_path = None
        max_data_size = max_data_size or float('inf')

    # Load features
    if features_path is not None:
        features_data = []
        for feat_path in features_path:
            features_data.append(load_features(feat_path))  # each is num_data x num_features
        features_data = np.concatenate(features_data, axis=1)
    else:
        features_data = None

    # Load smiles to skip
    if skip_smiles_path is not None:
        with open(skip_smiles_path) as f:
            reader = csv.reader(f)
            next(reader)  # skip header
            skip_smiles = {line[0] for line in reader}
    else:
        skip_smiles = set()

    # Load data
    with open(path) as f:
        reader = csv.reader(f)
        next(reader)  # skip header

        lines = []
        for line in reader:
            smiles = line[0]

            if smiles in skip_smiles:
                continue

            lines.append(line)

            if len(lines) >= max_data_size:
                break

        data = MoleculeDataset([
            MoleculeDatapoint(
                line=line,
                args=args,
                features=features_data[i] if features_data is not None else None,
                use_compound_names=use_compound_names
            ) for i, line in tqdm(enumerate(lines), total=len(lines))
        ])

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        data = filter_invalid_smiles(data)

        if len(data) < original_data_len:
            debug(f'Warning: {original_data_len - len(data)} SMILES are invalid.')

    if (args is not None and len(data) > 0
            and data.data[0].features is not None):
        args.features_dim = len(data.data[0].features)

    if args is not None and args.dataset_type == 'regression_with_binning':
        data = convert_to_classes(data, args.num_bins)

    return data
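
A sketch of the skip-list mechanism (Namespace fields beyond those read above
are assumptions):

from argparse import Namespace

# Molecules listed in 'seen.csv' are dropped before loading.
args = Namespace(skip_smiles_path='seen.csv', features_path=None,
                 max_data_size=None, dataset_type='regression')
dataset = get_data('data.csv', args=args)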
Example #9
def get_data(path: str,
             skip_invalid_smiles: bool = True,
             args: Namespace = None,
             features_path: List[str] = None,
             max_data_size: int = None,
             use_compound_names: bool = None,
             logger: Logger = None) -> MolPairDataset:
    """
    Gets SMILES strings and target values (and optionally compound names if provided) from a CSV file.

    :param path: Path to a CSV file.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles.
    :param args: Arguments.
    :param features_path: A list of paths to files containing features. If provided, it is used
    in place of args.features_path.
    :param max_data_size: The maximum number of data points to load.
    :param use_compound_names: Whether file has compound names in addition to smiles strings.
    :param logger: Logger.
    :return: A MolPairDataset containing smiles strings and target values along
    with other info such as additional features and compound names when desired.
    """
    debug = logger.debug if logger is not None else print

    if args is not None:
        # Prefer explicit function arguments but default to args if not provided
        features_path = features_path if features_path is not None else args.features_path
        max_data_size = max_data_size if max_data_size is not None else args.max_data_size
        use_compound_names = use_compound_names if use_compound_names is not None else args.use_compound_names
    else:
        use_compound_names = False

    max_data_size = max_data_size or float('inf')

    # Load features
    if features_path is not None:
        features_data = {}  # maps SMILES -> features
        for feat_path in features_path:
            debug(f'Loading features from {feat_path}')
            # Each file is expected to look like {SMILES: feats}; merging
            # keeps every file's entries rather than only the last file's
            features_data.update(load_features(feat_path))
    else:
        features_data = None

    skip_smiles = set()

    # Load data
    with open(path) as f:
        f.readline()  # skip header

        lines = []  # drug_smile, cmpd_smile, targets, context
        for line in f.readlines():
            line = line.strip().split(',')
            drug_smiles = line[0]
            cmpd_smiles = line[1]
            targets, context, feats1, feats2 = [], [], [], []

            if drug_smiles in skip_smiles or cmpd_smiles in skip_smiles:
                continue

            for i in range(2, len(line)):
                if (args is None or args.data_format is None
                        or args.data_format[i] == 'P'):
                    targets.append(float(line[i]) if line[i] != '' else None)
                elif args.data_format[i] == '1':
                    feats1.append(float(line[i]) if line[i] != '' else np.nan)
                elif args.data_format[i] == '2':
                    feats2.append(float(line[i]) if line[i] != '' else np.nan)
                else:
                    context.append(float(line[i]) if line[i] != '' else np.nan)
            lines.append(
                (drug_smiles, cmpd_smiles, targets, feats1, feats2, context))

            if len(lines) >= max_data_size:
                break

        data = MolPairDataset([
            MolPairDatapoint(
                drug_smiles=line[0],
                cmpd_smiles=line[1],
                targets=line[2],
                args=args,
                drug_feats=features_helper(line[0], features_data, line[3]),
                cmpd_feats=features_helper(line[1], features_data, line[4]),
                context=np.array(line[5]) if len(line[5]) > 0 else None)
            for i, line in tqdm(enumerate(lines), total=len(lines))
        ])

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        data = filter_invalid_smiles(data)

        if len(data) < original_data_len:
            debug(
                f'Warning: {original_data_len - len(data)} SMILES are invalid.'
            )

    if args is not None and args.use_input_features:
        args.features_dim = data.features_size()

    return data
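
A sketch of the column-code convention above (args.data_format is indexed by
CSV column; the codes are inferred from the branches, and the Namespace is
hypothetical):

from argparse import Namespace

# CSV columns: drug_smiles, cmpd_smiles, ic50, temperature
# 'P' -> prediction target, '1'/'2' -> per-molecule features,
# anything else -> context.
args = Namespace(data_format=[None, None, 'P', 'C'],
                 use_input_features=False)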
Example #10
def get_data(
    path: str,
    smiles_columns: Union[str, List[str]] = None,
    target_columns: List[str] = None,
    ignore_columns: List[str] = None,
    skip_invalid_smiles: bool = True,
    args: Union[TrainArgs, PredictArgs] = None,
    features_path: List[str] = None,
    features_generator: List[str] = None,
    atom_descriptors_path: str = None,
    bond_features_path: str = None,
    max_data_size: int = None,
    store_row: bool = False,
    logger: Logger = None,
    skip_none_targets: bool = False,
) -> MoleculeDataset:
    """
    Gets SMILES and target values from a CSV file.

    :param path: Path to a CSV file.
    :param smiles_columns: The names of the columns containing SMILES.
                           By default, uses the first :code:`number_of_molecules` columns.
    :param target_columns: Name of the columns containing target values. By default, uses all columns
                           except the :code:`smiles_column` and the :code:`ignore_columns`.
    :param ignore_columns: Name of the columns to ignore when :code:`target_columns` is not provided.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles using :func:`filter_invalid_smiles`.
    :param args: Arguments, either :class:`~chemprop.args.TrainArgs` or :class:`~chemprop.args.PredictArgs`.
    :param features_path: A list of paths to files containing features. If provided, it is used
                          in place of :code:`args.features_path`.
    :param features_generator: A list of features generators to use. If provided, it is used
                               in place of :code:`args.features_generator`.
    :param atom_descriptors_path: The path to the file containing the custom atom descriptors.
    :param bond_features_path: The path to the file containing the custom bond features.
    :param max_data_size: The maximum number of data points to load.
    :param logger: A logger for recording output.
    :param store_row: Whether to store the raw CSV row in each :class:`~chemprop.data.data.MoleculeDatapoint`.
    :param skip_none_targets: Whether to skip targets that are all 'None'. This is mostly relevant when --target_columns
                              are passed in, so only a subset of tasks are examined.
    :return: A :class:`~chemprop.data.MoleculeDataset` containing SMILES and target values along
             with other info such as additional features when desired.
    """
    debug = logger.debug if logger is not None else print

    if args is not None:
        # Prefer explicit function arguments but default to args if not provided
        smiles_columns = smiles_columns if smiles_columns is not None else args.smiles_columns
        target_columns = target_columns if target_columns is not None else args.target_columns
        ignore_columns = ignore_columns if ignore_columns is not None else args.ignore_columns
        features_path = features_path if features_path is not None else args.features_path
        features_generator = (features_generator if features_generator
                              is not None else args.features_generator)
        atom_descriptors_path = (atom_descriptors_path if atom_descriptors_path
                                 is not None else args.atom_descriptors_path)
        bond_features_path = (bond_features_path if bond_features_path
                              is not None else args.bond_features_path)
        max_data_size = max_data_size if max_data_size is not None else args.max_data_size

    if not isinstance(smiles_columns, list):
        smiles_columns = preprocess_smiles_columns(
            path=path, smiles_columns=smiles_columns)

    max_data_size = max_data_size or float("inf")

    # Load features
    if features_path is not None:
        features_data = []
        for feat_path in features_path:
            features_data.append(
                load_features(feat_path))  # each is num_data x num_features
        features_data = np.concatenate(features_data, axis=1)
    else:
        features_data = None

    # Load data
    with open(path) as f:
        reader = csv.DictReader(f)

        # By default, the targets columns are all the columns except the SMILES column
        if target_columns is None:
            target_columns = get_task_names(
                path=path,
                smiles_columns=smiles_columns,
                target_columns=target_columns,
                ignore_columns=ignore_columns,
            )

        all_smiles, all_targets, all_rows, all_features = [], [], [], []
        for i, row in enumerate(tqdm(reader)):
            smiles = [row[c] for c in smiles_columns]

            targets = [
                float(row[column]) if row[column] != "" else None
                for column in target_columns
            ]

            # Check whether all targets are None and skip if so
            if skip_none_targets and all(x is None for x in targets):
                continue

            all_smiles.append(smiles)
            all_targets.append(targets)

            if features_data is not None:
                all_features.append(features_data[i])

            if store_row:
                all_rows.append(row)

            if len(all_smiles) >= max_data_size:
                break

        atom_features = None
        atom_descriptors = None
        if args is not None and args.atom_descriptors is not None:
            try:
                descriptors = load_valid_atom_or_bond_features(
                    atom_descriptors_path, [x[0] for x in all_smiles])
            except Exception as e:
                raise ValueError(
                    f"Failed to load or validate custom atomic descriptors or features: {e}"
                )

            if args.atom_descriptors == "feature":
                atom_features = descriptors
            elif args.atom_descriptors == "descriptor":
                atom_descriptors = descriptors

        bond_features = None
        if args is not None and args.bond_features_path is not None:
            try:
                bond_features = load_valid_atom_or_bond_features(
                    bond_features_path, [x[0] for x in all_smiles])
            except Exception as e:
                raise ValueError(
                    f"Failed to load or validate custom bond features: {e}")

        data = MoleculeDataset([
            MoleculeDatapoint(
                smiles=smiles,
                targets=targets,
                row=all_rows[i] if store_row else None,
                features_generator=features_generator,
                features=all_features[i]
                if features_data is not None else None,
                atom_features=atom_features[i]
                if atom_features is not None else None,
                atom_descriptors=atom_descriptors[i]
                if atom_descriptors is not None else None,
                bond_features=bond_features[i]
                if bond_features is not None else None,
                overwrite_default_atom_features=(
                    args.overwrite_default_atom_features
                    if args is not None else False),
                overwrite_default_bond_features=(
                    args.overwrite_default_bond_features
                    if args is not None else False),
            ) for i, (smiles, targets) in tqdm(
                enumerate(zip(all_smiles, all_targets)), total=len(all_smiles))
        ])

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        data = filter_invalid_smiles(data)

        if len(data) < original_data_len:
            debug(
                f"Warning: {original_data_len - len(data)} SMILES are invalid."
            )

    return data
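
A sketch of supplying custom atom descriptors (the Args class stands in for
TrainArgs and sets only the fields read above; the file name is hypothetical):

class Args:
    smiles_columns = ['smiles']
    target_columns = None
    ignore_columns = None
    features_path = None
    features_generator = None
    atom_descriptors = 'descriptor'      # or 'feature'
    atom_descriptors_path = 'atoms.pkl'  # assumed file
    bond_features_path = None
    max_data_size = None
    overwrite_default_atom_features = False
    overwrite_default_bond_features = False

dataset = get_data('data.csv', args=Args())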
Example #11
def get_data(path: str,
             args: Namespace = None,
             max_data_size: int = None,
             use_compound_names: bool = False) -> MoleculeDataset:
    """
    Gets SMILES strings and target values (and optionally compound names if provided) from a CSV file.

    :param path: Path to a CSV file.
    :param args: Arguments.
    :param max_data_size: The maximum number of data points to load.
    :param use_compound_names: Whether file has compound names in addition to smiles strings.
    :return: A MoleculeDataset containing smiles strings and target values along
    with other info such as additional features and compound names when desired.
    """
    if args is not None:
        max_data_size = min(args.max_data_size or float('inf'),
                            max_data_size or float('inf'))
        skip_smiles_path = args.skip_smiles_path

        if args.features_path:
            features_data = []
            for features_path in args.features_path:
                features_data.append(load_features(
                    features_path))  # each is num_data x num_features
            features_data = np.concatenate(features_data, axis=1)
        else:
            features_data = None
    else:
        features_data = skip_smiles_path = None
        max_data_size = max_data_size or float('inf')

    # Load smiles to skip
    if skip_smiles_path is not None:
        with open(skip_smiles_path) as f:
            f.readline()  # skip header
            skip_smiles = {line.split(',')[0] for line in f}
    else:
        skip_smiles = set()

    # Load data
    with open(path) as f:
        f.readline()  # skip header
        lines = []
        for line in f:
            smiles = line.split(',')[0]

            if smiles == '':
                # Band-aid: substitute a placeholder carbon atom when a rare
                # malformed line leaves the SMILES field empty
                line = 'C' + line

            if smiles in skip_smiles:
                continue

            lines.append(line)

            if len(lines) >= max_data_size:
                break

        data = MoleculeDataset([
            MoleculeDatapoint(
                line=line.strip().split(','),
                args=args,
                features=features_data[i]
                if features_data is not None else None,
                use_compound_names=use_compound_names,
            ) for i, line in tqdm(enumerate(lines), total=len(lines))
        ])

    if (args is not None and len(data) > 0
            and data.data[0].features is not None):
        args.features_dim = len(data.data[0].features)

    if args is not None and args.dataset_type == 'regression_with_binning':
        data = convert_to_classes(data, args.num_bins)

    return data
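
A sketch of the binned-regression branch (fields other than those read above
are assumptions):

from argparse import Namespace

args = Namespace(max_data_size=None, skip_smiles_path=None,
                 features_path=None, num_bins=20,
                 dataset_type='regression_with_binning')
dataset = get_data('data.csv', args=args)
# Targets are discretized into args.num_bins classes by convert_to_classes.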