Пример #1
0
    def _load(self):
        """
        Build the graphs and the bond-class labels of the dataset.

        The raw labels (and the extra features, when given) are loaded, every
        entry of the sdf file that can be read is featurized into a graph, and the
        results are stored in `self.graphs` and `self.labels`. Entries the sdf
        supplier yields as `None` are skipped, so both lists may be shorter than
        the raw labels; the index of the entry in the file is kept as `graph_id`
        of the graph and as `id` of the label.
        """
        logger.info("Start loading dataset")

        # labels are always read; extra features are optional
        raw_labels = yaml_load(self.raw_labels)
        features = (
            [None] * len(raw_labels)
            if self.extra_features is None
            else yaml_load(self.extra_features)
        )

        # molecules come from the sdf file
        supplier = Chem.SDMolSupplier(self.molecules, sanitize=True, removeHs=False)
        species = get_dataset_species(self.molecules)

        self.graphs = []
        self.labels = []
        for index, rdkit_mol in enumerate(supplier):
            if index % 100 == 0:
                logger.info(f"Processing molecule {index}/{len(raw_labels)}")

            # a `None` entry is a molecule that could not be read; nothing to build
            if rdkit_mol is not None:
                graph = self.grapher.build_graph_and_featurize(
                    rdkit_mol,
                    extra_feats_info=features[index],
                    dataset_species=species,
                )
                # keep the position in the sdf file, since some entries may fail
                graph.graph_id = index
                self.graphs.append(graph)

                self.labels.append(
                    {
                        "value": torch.tensor(raw_labels[index], dtype=torch.int64),
                        "id": index,
                    }
                )

        # the grapher only knows the feature name and size after
        # build_graph_and_featurize has been called
        self._feature_name = self.grapher.feature_name
        self._feature_size = self.grapher.feature_size
        logger.info(f"Feature name: {self.feature_name}")
        logger.info(f"Feature size: {self.feature_size}")

        # optional standardization of the graph features
        if self.feature_transformer:
            scaler = HeteroGraphFeatureStandardScaler()
            self.graphs = scaler(self.graphs)
            logger.info(f"Feature scaler mean: {scaler.mean}")
            logger.info(f"Feature scaler std: {scaler.std}")

        logger.info(f"Finish loading {len(self.labels)} graphs...")
Пример #2
0
def _read_molecules(molecule_file, molecule_attributes_file):
    """
    Read molecules and their attributes, and convert them to wrapper molecules.

    Args:
        molecule_file: file of the molecules, read with `read_rdkit_mols_from_file`.
        molecule_attributes_file: file loaded with `yaml_load`, holding one
            attribute entry per molecule; each entry must provide a `charge`.

    Returns:
        list with the result of `rdkit_mol_to_wrapper_mol` for each molecule, in
        the order of `molecule_file`.

    Raises:
        AssertionError: if the number of molecules and the number of attribute
            entries differ.
    """

    # read rdkit mols
    rdkit_mols = read_rdkit_mols_from_file(molecule_file)

    # read molecule attributes
    attrs = yaml_load(molecule_attributes_file)
    msg = (
        f"expect the number of molecules given in {molecule_file} "
        f"and the number of molecule attributes given in {molecule_attributes_file} to "
        f"be the same, but got {len(rdkit_mols)} and {len(attrs)}. ")
    assert len(rdkit_mols) == len(attrs), msg

    # convert rdkit mols to wrapper molecules
    # the identifier combines the name of the molecule and its position in the
    # file; a molecule that failed to be read (None) gets no identifier
    identifiers = [
        m.GetProp("_Name") + f"_index-{i}" if m is not None else None
        for i, m in enumerate(rdkit_mols)
    ]
    charges = [a["charge"] for a in attrs]
    molecules = [
        rdkit_mol_to_wrapper_mol(m,
                                 charge=cg,
                                 free_energy=None,
                                 identifier=identifier)
        for m, identifier, cg in zip(rdkit_mols, identifiers, charges)
    ]

    return molecules
Пример #3
0
    def _load(self):
        """
        Build the graphs and labels of the dataset.

        The label file provides, per molecule, a value, an indicator and a molecule
        source. Each readable molecule is featurized into a graph (stored in
        `self.graphs`) and gets a label dict (stored in `self.labels`) with keys
        `value` (torch.int64 tensor), `indicator` (int) and `id` (the molecule
        source). Molecules that are `None` are skipped.
        """
        logger.info("Start loading dataset")

        # labels are always read; extra features are optional
        raw_value, raw_indicator, raw_mol_source = self._read_label_file()
        features = (
            [None] * len(raw_value)
            if self.extra_features is None
            else yaml_load(self.extra_features)
        )

        # molecules to build the graphs from
        molecules = self.get_molecules(self.molecules)
        species = get_dataset_species(molecules)

        self.graphs = []
        self.labels = []
        for index, molecule in enumerate(molecules):
            if index % 100 == 0:
                logger.info(f"Processing molecule {index}/{len(raw_value)}")

            # a `None` entry is a molecule that could not be read; nothing to build
            if molecule is not None:
                graph = self.grapher.build_graph_and_featurize(
                    molecule,
                    extra_feats_info=features[index],
                    dataset_species=species,
                )
                # keep the position of the molecule, since some entries may fail
                graph.graph_id = index
                self.graphs.append(graph)

                self.labels.append(
                    {
                        "value": torch.tensor(raw_value[index], dtype=torch.int64),
                        "indicator": int(raw_indicator[index]),
                        "id": raw_mol_source[index],
                    }
                )

        # the grapher only knows the feature name and size after
        # build_graph_and_featurize has been called
        self._feature_name = self.grapher.feature_name
        self._feature_size = self.grapher.feature_size
        logger.info(f"Feature name: {self.feature_name}")
        logger.info(f"Feature size: {self.feature_size}")

        # optional standardization of the graph features
        if self.feature_transformer:
            scaler = HeteroGraphFeatureStandardScaler()
            self.graphs = scaler(self.graphs)
            logger.info(f"Feature scaler mean: {scaler.mean}")
            logger.info(f"Feature scaler std: {scaler.std}")

        logger.info(f"Finish loading {len(self.labels)} graphs...")
Пример #4
0
    def write_results(self, predictions, filename="bed_result.yaml"):
        """
        Store each prediction under the key 'prediction' of the corresponding
        reaction dict of the label file, then output the updated reactions.

        Args:
            predictions: one value per reaction of the label file, in the same
                order; `None` marks a reaction for which no prediction was made.
            filename: file to dump the updated reactions to. If `None`, they are
                printed instead.
        """
        reactions = yaml_load(self.label_file)

        failed_ids = []
        for rxn, pred in zip(reactions, predictions):
            if pred is not None:
                rxn["prediction"] = float(pred)
                continue
            # no prediction: remember the reaction and keep `None` as its result
            failed_ids.append(str(rxn["id"]))
            rxn["prediction"] = pred

        # tell the user which reactions have no prediction
        if failed_ids:
            msg = ", ".join(failed_ids)
            print(
                f"These reactions failed when converting their molecules, and therefore "
                f"predictions for them are not made: {msg}")

        if filename is None:
            print(reactions)
        else:
            yaml_dump(reactions, to_path(filename))
Пример #5
0
    def read_molecules(self):
        """
        Read the molecules in `self.molecule_file` and convert them to wrapper
        molecules.

        For format `graph`, the file (json or yaml, chosen by its extension) holds
        dicts from which `MoleculeGraph`s are created; a charge file, if given, is
        ignored with a warning. For any other format, the molecules are read with
        `read_rdkit_mols_from_file`, and their charges come from `self.charge_file`
        (all zero when no charge file is given).

        Returns:
            list of the converted molecules; it is also stored in `self._molecules`.

        Raises:
            ValueError: for format `graph`, if the extension of the molecule file
                is not one of `.json`, `.yaml`, `.yml`.
            AssertionError: if a charge file is given and the number of charges
                differs from the number of molecules.
        """

        if self.format == "graph":

            if self.charge_file is not None:
                warnings.warn(
                    f"charge file {self.charge_file} ignored for format `graph`"
                )

            file_type = self.molecule_file.suffix
            if file_type == ".json":
                with open(self.molecule_file, "r") as f:
                    mol_graph_dicts = json.load(f)
            elif file_type in [".yaml", ".yml"]:
                mol_graph_dicts = yaml_load(self.molecule_file)
            else:
                supported = [".json", ".yaml", ".yml"]
                raise ValueError(
                    f"File extension of {self.molecule_file} not supported; "
                    f"supported are: {supported}.")

            mol_graphs = [MoleculeGraph.from_dict(d) for d in mol_graph_dicts]
            # the position in the file serves as the id of the molecule
            molecules = [
                MoleculeWrapper(g, id=str(i)) for i, g in enumerate(mol_graphs)
            ]

        else:
            # read rdkit mols
            rdkit_mols = read_rdkit_mols_from_file(self.molecule_file,
                                                   self.format)

            # read charge file
            if self.charge_file is None:
                charges = [0] * len(rdkit_mols)
            else:
                charges = read_charge(self.charge_file)
                msg = (
                    f"expect the number of molecules given in {self.molecule_file} "
                    f"and the number of charges given in {self.charge_file} to be "
                    f"the same, but got {len(rdkit_mols)} and {len(charges)}. "
                )
                assert len(rdkit_mols) == len(charges), msg

            # convert rdkit mols to wrapper molecules
            # the identifier combines the name of the molecule and its position in
            # the file; a molecule that failed to be read (None) gets no identifier
            identifiers = [
                m.GetProp("_Name") + f"_index-{i}" if m is not None else None
                for i, m in enumerate(rdkit_mols)
            ]
            molecules = rdkit_mols_to_wrapper_mols(rdkit_mols,
                                                   identifiers,
                                                   charges,
                                                   nprocs=self.nprocs)

        self._molecules = molecules

        return molecules
Пример #6
0
def _check_charge(model_path, features):
    """
    Make sure every molecular charge of the dataset is supported by the model.

    Args:
        model_path: location of the model, passed to `get_model_info`.
        features: molecule features, each providing a `charge`; or the path
            (str or Path) of a file to load them from.

    Raises:
        ValueError: if the dataset contains a charge that is not listed in the
            `allowed_charge` of the model info.
    """
    if isinstance(features, (str, Path)):
        features = yaml_load(features)
    dataset_charges = {entry["charge"] for entry in features}

    supported_charges = set(get_model_info(model_path)["allowed_charge"])

    if dataset_charges <= supported_charges:
        return

    raise ValueError(
        f"Supported molecular charges include {supported_charges}; "
        f"but the dataset contains molecules of charge {dataset_charges}.")
def get_charges(label_file, feature_file):
    """
    Tabulate the charges of the reactant and product molecules of each reaction.

    Args:
        label_file: file loaded with `yaml_load`, giving for each reaction its
            `id`, `reactants` and `products` (indices into the features).
        feature_file: file loaded with `yaml_load`, giving for each molecule an
            entry with its `charge`.

    Returns:
        pandas DataFrame with one row per reaction and the columns `identifier`,
        `num products`, `charge` (of the first reactant), `product1 charge` and
        `product2 charge` (`None` unless the reaction has exactly two products).
    """
    labels = yaml_load(label_file)
    features = yaml_load(feature_file)

    def charge_of(mol_index):
        return features[mol_index]["charge"]

    columns = {
        "identifier": [],
        "num products": [],
        "charge": [],
        "product1 charge": [],
        "product2 charge": [],
    }

    for rxn in labels:
        columns["identifier"].append(rxn["id"])
        reactant = rxn["reactants"][0]
        products = rxn["products"]

        num_products = len(products)
        columns["num products"].append(num_products)

        columns["charge"].append(charge_of(reactant))
        columns["product1 charge"].append(charge_of(products[0]))
        # the second product only exists for reactions with two products
        columns["product2 charge"].append(
            charge_of(products[1]) if num_products == 2 else None
        )

    return pd.DataFrame(columns)
Пример #8
0
def _read_reactions(molecules, reaction_file):
    """
    Create the reactions described in a reaction file.

    Args:
        molecules: list of molecules the reactions refer to by index; `None`
            marks a molecule that failed to be read.
        reaction_file: file loaded with `yaml_load`; each entry gives `reactants`
            and `products` (molecule indices) and `energy`.

    Returns:
        list of `Reaction`, built from the first reactant and the first one or two
        products of each entry. Entries that use an unreadable molecule are left
        out, and a warning is logged for each of them.
    """
    rxns = yaml_load(reaction_file)

    unreadable = {pos for pos, mol in enumerate(molecules) if mol is None}

    reactions = []
    skipped = []  # (index of reaction, index of its first unreadable molecule)

    for rxn_index, rxn in enumerate(rxns):
        reactant_idx = rxn["reactants"][0]
        product1_idx = rxn["products"][0]
        product2_idx = None if len(rxn["products"]) == 1 else rxn["products"][1]
        energy = rxn["energy"]

        # `unreadable` only holds ints, so `None` reliably means "all readable"
        first_bad = next(
            (
                k
                for k in (reactant_idx, product1_idx, product2_idx)
                if k in unreadable
            ),
            None,
        )
        if first_bad is not None:
            skipped.append((rxn_index, first_bad))
            continue

        reactants = [molecules[reactant_idx]]
        products = [molecules[product1_idx]]
        if product2_idx is not None:
            products.append(molecules[product2_idx])

        reactions.append(
            Reaction(
                reactants=reactants,
                products=products,
                broken_bond=None,
                free_energy=energy,
                identifier=f"reaction_{rxn_index}",
            )
        )

    # report the ignored reactions once all of them have been processed
    for rxn_index, mol_index in skipped:
        logger.warning(
            f"Reaction {rxn_index} ignored because failed to read molecule {mol_index}."
        )

    return reactions
Пример #9
0
def _check_charge(model_path, features):
    """
    Make sure every molecular charge of the dataset is supported by the model.

    Args:
        model_path: location of the model, passed to `get_model_info`.
        features: molecule features, each providing a `charge`; or the path
            (str or Path) of a file to load them from.

    Raises:
        ValueError: if the dataset contains a charge that is not listed in the
            `allowed_charge` of the model info.
    """
    if isinstance(features, (str, Path)):
        features = yaml_load(features)
    charges = {x["charge"] for x in features}

    model_info = get_model_info(model_path)
    allowed_charge = set(model_info["allowed_charge"])

    if not charges.issubset(allowed_charge):
        # adjacent literals are concatenated as is, so every piece that is followed
        # by another sentence has to end with a space
        raise ValueError(
            f"Supported molecular charges include {allowed_charge}; "
            f"but the dataset contains molecules of charge {charges}. "
            f"Note that two models trained on different datasets are provided: "
            f"the `pubchem` supports neutral molecules and the `bdncm` supports "
            f"molecules of charge -1, 0, and 1. "
            f"You may want to switch the model if you see this message.")
Пример #10
0
 def get_features(features):
     """
     Return the features, loading them with `yaml_load` when a `Path` is given.

     Anything that is not a `Path` is returned unchanged.
     """
     return yaml_load(features) if isinstance(features, Path) else features
Пример #11
0
 def get_labels(labels):
     """
     Return the labels, loading them with `yaml_load` when a `Path` is given.

     Anything that is not a `Path` is returned unchanged.
     """
     return yaml_load(labels) if isinstance(labels, Path) else labels
Пример #12
0
    def _load(self):
        """
        Build the graphs and labels of the reactions of the dataset.

        A graph is built for every molecule (`None` for a molecule that failed to
        be read), and the graphs are then regrouped into reactions according to the
        `num_mols` of each raw label. Reactions that contain a failed molecule are
        dropped. For the remaining ones, `self.graphs` holds the graphs of each
        reaction and `self.labels` the raw label dict, updated in place: `value`
        becomes a tensor of dtype `self.dtype` and `global_mapping` is added.

        If `self.feature_transformer` is set, the graph features are standardized.
        If `self.label_transformer` is set, the label values are standardized, and
        the mean and standard deviation used are stored in each label as
        `scaler_mean` and `scaler_stdev`.
        """

        logger.info("Start loading dataset")

        # read label and feature file
        raw_labels = yaml_load(self.raw_labels)
        if self.extra_features is not None:
            features = yaml_load(self.extra_features)
        else:
            # Labels are given per reaction, but features are consumed per molecule
            # in the zip below. A list sized by the number of labels would end the
            # zip before all molecules are processed, so supply as many `None` as
            # there are molecules.
            features = itertools.repeat(None)

        # build graph for mols from sdf file
        molecules = self.get_molecules(self.molecules)
        species = get_dataset_species(molecules)

        graphs = []
        for i, (mol, feats) in enumerate(zip(molecules, features)):
            if i % 100 == 0:
                # NOTE(review): the total logged here is the number of labels
                # (reactions), not the number of molecules
                logger.info(f"Processing molecule {i}/{len(raw_labels)}")

            if mol is not None:
                g = self.grapher.build_graph_and_featurize(
                    mol, extra_feats_info=feats, dataset_species=species
                )
                # add this for check purpose; some entries in the sdf file may fail
                g.graph_id = i
            else:
                # keep a placeholder so that the regrouping by `num_mols` stays aligned
                g = None
            graphs.append(g)

        # Should after grapher.build_graph_and_featurize, which initializes the
        # feature name and size
        self._feature_name = self.grapher.feature_name
        self._feature_size = self.grapher.feature_size

        logger.info("Feature name: {}".format(self.feature_name))
        logger.info("Feature size: {}".format(self.feature_size))

        # regroup graphs to reactions
        num_mols = [lb["num_mols"] for lb in raw_labels]
        reactions = list_split_by_size(graphs, num_mols)

        # global feat mapping
        global_mapping = [[{0: 0} for _ in range(n)] for n in num_mols]

        self.graphs = []
        self.labels = []
        for rxn, lb, gmp in zip(reactions, raw_labels, global_mapping):
            # skip reactions with a molecule that failed to be converted
            if None not in rxn:
                lb["value"] = torch.tensor(lb["value"], dtype=getattr(torch, self.dtype))
                lb["global_mapping"] = gmp
                self.graphs.append(rxn)
                self.labels.append(lb)

        # transformers
        if self.feature_transformer:
            # the scaler works on a flat list of graphs; the graphs are split back
            # into reactions afterwards
            graphs = list(itertools.chain.from_iterable(self.graphs))  # flatten the list
            feature_scaler = HeteroGraphFeatureStandardScaler()
            graphs = feature_scaler(graphs)
            num_mols = [len(rxn) for rxn in self.graphs]
            self.graphs = list_split_by_size(graphs, num_mols)
            logger.info("Feature scaler mean: {}".format(feature_scaler.mean))
            logger.info("Feature scaler std: {}".format(feature_scaler.std))

        if self.label_transformer:

            # normalization
            values = [lb["value"] for lb in self.labels]  # list of 0D tensor
            # np and torch compute slightly differently std (depending on `ddof` of np)
            # here we choose to use np
            mean = float(np.mean(values))
            std = float(np.std(values))
            values = (torch.stack(values) - mean) / std
            std = torch.tensor(std, dtype=getattr(torch, self.dtype))
            mean = torch.tensor(mean, dtype=getattr(torch, self.dtype))

            # update label
            for i, lb in enumerate(values):
                self.labels[i]["value"] = lb
                self.labels[i]["scaler_mean"] = mean
                self.labels[i]["scaler_stdev"] = std

            logger.info("Label scaler mean: {}".format(mean))
            logger.info("Label scaler std: {}".format(std))

        logger.info("Finish loading {} reactions...".format(len(self.labels)))
Пример #13
0
    def _load(self):
        """
        Build the graphs and labels of the dataset.

        Every molecule that is not `None` is featurized into a graph (stored in
        `self.graphs`) and gets a label dict (stored in `self.labels`) with keys
        `value` (tensor of dtype `self.dtype`) and `id` (position of the molecule).

        If `self.feature_transformer` is set, the graph features are standardized.
        If `self.label_transformer` is set, each label column is scaled: columns
        flagged as extensive are divided by the number of atoms of the molecule,
        the other ones are standardized with `StandardScaler`. The per-molecule
        mean and standard deviation used are stored in each label as `scaler_mean`
        and `scaler_stdev`.
        """

        logger.info("Start loading dataset")

        # read label and feature file
        # `extensive` holds one flag per label column (see the scaling below)
        raw_labels, extensive = self._read_label_file()
        if self.extra_features is not None:
            features = yaml_load(self.extra_features)
        else:
            features = [None] * len(raw_labels)

        # build graph for mols from sdf file
        molecules = self.get_molecules(self.molecules)
        species = get_dataset_species(molecules)

        self.graphs = []
        self.labels = []
        natoms = []  # number of atoms of each molecule that is kept
        for i, (mol, feats, lb) in enumerate(zip(molecules, features, raw_labels)):

            if i % 100 == 0:
                logger.info("Processing molecule {}/{}".format(i, len(raw_labels)))

            # bad mol: no graph, no label
            if mol is None:
                continue

            # graph
            g = self.grapher.build_graph_and_featurize(
                mol, extra_feats_info=feats, dataset_species=species
            )
            # we add this for check purpose, because some entries in the sdf file may fail
            g.graph_id = i
            self.graphs.append(g)

            # label
            lb = torch.tensor(lb, dtype=getattr(torch, self.dtype))
            self.labels.append({"value": lb, "id": i})

            natoms.append(mol.GetNumAtoms())

        # this should be called after grapher.build_graph_and_featurize,
        # which initializes the feature name and size
        self._feature_name = self.grapher.feature_name
        self._feature_size = self.grapher.feature_size
        logger.info("Feature name: {}".format(self.feature_name))
        logger.info("Feature size: {}".format(self.feature_size))

        # feature and label transformer
        if self.feature_transformer:
            feature_scaler = HeteroGraphFeatureStandardScaler()
            self.graphs = feature_scaler(self.graphs)
            logger.info("Feature scaler mean: {}".format(feature_scaler.mean))
            logger.info("Feature scaler std: {}".format(feature_scaler.std))

        if self.label_transformer:
            # rows are molecules, columns are label types
            # (assumes each raw label is a 1D sequence, as the column indexing
            # below requires -- TODO confirm)
            labels = np.asarray([lb["value"].numpy() for lb in self.labels])
            natoms = np.asarray(natoms, dtype=np.float32)

            # per label column: scaled values and the mean/std of each molecule
            scaled_labels = []
            scaler_mean = []
            scaler_std = []

            # per label column: summary of the scaler, only used for logging
            label_scaler_mean = []
            label_scaler_std = []

            for i, is_ext in enumerate(extensive):
                if is_ext:
                    # extensive labels standardized by the number of atoms in the
                    # molecules, i.e. y' = y/natoms
                    lb = labels[:, i] / natoms
                    # expressed as mean 0 and std natoms, so that both kinds of
                    # labels are described by a (mean, std) pair
                    mean = np.zeros(len(lb))
                    std = natoms
                    label_scaler_mean.append(None)
                    label_scaler_std.append("num atoms")
                else:
                    # intensive labels standardized by y' = (y - mean(y))/std(y)
                    scaler = StandardScaler()
                    lb = labels[:, [i]]  # 2D array of shape (N, 1)
                    lb = scaler(lb)
                    lb = lb.ravel()
                    # the same mean and std apply to every molecule
                    mean = np.repeat(scaler.mean, len(lb))
                    std = np.repeat(scaler.std, len(lb))
                    label_scaler_mean.append(scaler.mean)
                    label_scaler_std.append(scaler.std)
                scaled_labels.append(lb)
                scaler_mean.append(mean)
                scaler_std.append(std)

            # the lists are ordered by label column; transpose to get one row per
            # molecule again
            scaled_labels = torch.tensor(
                np.asarray(scaled_labels).T, dtype=getattr(torch, self.dtype)
            )
            scaler_mean = torch.tensor(
                np.asarray(scaler_mean).T, dtype=getattr(torch, self.dtype)
            )
            scaler_std = torch.tensor(
                np.asarray(scaler_std).T, dtype=getattr(torch, self.dtype)
            )

            # update the label of each molecule
            for i, (lb, m, s) in enumerate(zip(scaled_labels, scaler_mean, scaler_std)):
                self.labels[i]["value"] = lb
                self.labels[i]["scaler_mean"] = m
                self.labels[i]["scaler_stdev"] = s

            logger.info("Label scaler mean: {}".format(label_scaler_mean))
            logger.info("Label scaler std: {}".format(label_scaler_std))

        logger.info("Finish loading {} labels...".format(len(self.labels)))
Пример #14
0
def get_model_info(model_path):
    """
    Load the content of the `model_info.yaml` file located under `model_path`.

    `model_path` must provide `joinpath` (e.g. a `pathlib.Path`).
    """
    return yaml_load(model_path.joinpath("model_info.yaml"))
Пример #15
0
 def get_extra_features(fname):
     """Return the extra features stored in the file `fname`, read with `yaml_load`."""
     extra_features = yaml_load(fname)
     return extra_features