Example #1
    def __init__(
        self,
        optimizer_names: Optional[List[str]] = None,
        log_scale: bool = False,
        quantile_width: float = 0.5,
        show_extrema: bool = True,
        **kwargs,
    ):
        self._optimizer_names = params.optional_(
            optimizer_names, lambda arg: params.sequence(arg, type_=str)
        )
        self._show_extrema = params.boolean(show_extrema)
        log_scale = params.boolean(log_scale)
        scale = "log" if log_scale else "linear"

        self._quantile_width = params.real(quantile_width, from_=0, to=1)

        kwargs["axes_scales"] = kwargs.get("axes_scales", (scale, "linear"))
        kwargs["axes_labels"] = kwargs.get(
            "axes_labels", ("function evaluations", "best score", None, None)
        )
        kwargs["rectify"] = False
        kwargs["visualization_type"] = "shaded-line"

        super().__init__(**kwargs)
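The constructor above relies on a common pattern: plot-specific defaults are written into kwargs with dict.get, so an explicit caller value always wins before everything is forwarded to the base class. A minimal standalone sketch of that pattern (hypothetical function name):

def make_plot_settings(**kwargs):
    # apply a default only when the caller did not pass the keyword explicitly
    kwargs["axes_scales"] = kwargs.get("axes_scales", ("log", "linear"))
    kwargs["axes_labels"] = kwargs.get("axes_labels", ("function evaluations", "best score", None, None))
    return kwargs

print(make_plot_settings())                                   # defaults are filled in
print(make_plot_settings(axes_scales=("linear", "linear")))   # caller's value is kept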
Example #2
    def __init__(self,
                 internal_hp_optimization: bool = True,
                 kernel: Optional[Kernel] = None,
                 alpha: Union[float, Sequence] = 1e-5,
                 optimizer="fmin_l_bfgs_b",
                 n_restarts_optimizer=0,
                 normalize_y=False,
                 random_state: int = None,
                 **kwargs):
        """Initialize state.

        sklearn-specific parameters are passed through to the implementation.

        Parameters:
            internal_hp_optimization: if True, hyperparameters are optimized "internally"
                by the Gaussian process, that is, scikit-learn optimizes hyperparameters
                and for smlb the learner has no hyperparameters;
                if False, hyperparameters are optimized by smlb (and scikit-learn does
                not optimize any hyperparameters)
            kernel: scikit-learn kernel; if None, a single Gaussian kernel is used as default
            alpha: regularization constant (scalar or vector); added as-is to kernel matrix diagonal.
                   Equivalent to adding a "WhiteKernel"; the default is the corresponding value from
                   scikit-learn's WhiteKernel, and different from scikit-learn's GaussianProcessRegressor.
            optimizer: hyperparameter optimization algorithm; used only if internal_hp_optimization is True
            n_restarts_optimizer: number of times optimizer is restarted; only used if internal_hp_optimization is True
            normalize_y: whether to subtract the mean of the labels
            random_state: integer seed

        See skl.gaussian_process.GaussianProcessRegressor parameters.
        """

        super().__init__(**kwargs)

        internal_hp_optimization = params.boolean(internal_hp_optimization)
        kernel = params.any_(kernel, lambda arg: params.instance(arg, Kernel),
                             params.none)
        # incomplete check for alpha as dimension becomes known only at fitting time
        alpha = params.any_(
            alpha,
            lambda arg: params.real(arg, from_=0),
            lambda arg: params.real_vector(arg, domain=[0, np.inf]),
        )
        # todo: check optimizer, requires params.union (of string and callable) and params.function
        normalize_y = params.boolean(normalize_y)
        random_state = params.integer(random_state)

        if kernel is None:
            kernel = skl.gaussian_process.kernels.RBF() + skl.gaussian_process.kernels.WhiteKernel()

        assert internal_hp_optimization is True  # external HP optimization not yet supported

        self._model = skl.gaussian_process.GaussianProcessRegressor(
            kernel=kernel,
            alpha=alpha,
            optimizer=optimizer,
            n_restarts_optimizer=n_restarts_optimizer,
            normalize_y=normalize_y,
            random_state=random_state,
        )
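Outside of smlb, the same default configuration can be reproduced directly with scikit-learn; a small sketch in which the RBF-plus-WhiteKernel default and the alpha term on the kernel matrix diagonal mirror the wrapper above:

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

X = np.linspace(0, 1, 20).reshape(-1, 1)
y = np.sin(4 * X).ravel()

gpr = GaussianProcessRegressor(
    kernel=RBF() + WhiteKernel(),  # default kernel chosen when kernel is None
    alpha=1e-5,                    # added as-is to the kernel matrix diagonal
    n_restarts_optimizer=0,
    normalize_y=False,
    random_state=0,
)
gpr.fit(X, y)
mean, std = gpr.predict(X, return_std=True)  # predictive mean and standard deviation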
Example #3
def test_boolean():
    """Tests for boolean arguments."""

    assert params.boolean(True) is True
    assert params.boolean("true") is True
    assert params.boolean("True") is True

    assert params.boolean(False) is False
    assert params.boolean("false") is False
    assert params.boolean("False") is False

    with pytest.raises(InvalidParameterError):
        params.boolean(0)
    with pytest.raises(InvalidParameterError):
        params.boolean(1.0)
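These tests pin down the accepted inputs: Python booleans and the strings 'true'/'false' (capitalized or not) pass through, everything else raises. A minimal validator sketch consistent with that behavior, not smlb's actual implementation:

class InvalidParameterError(Exception):
    pass

def boolean(arg):
    """Return True/False for booleans and the strings 'true'/'false'; reject everything else."""
    if isinstance(arg, bool):
        return arg
    if isinstance(arg, str) and arg.lower() in ("true", "false"):
        return arg.lower() == "true"
    raise InvalidParameterError(f"boolean expected, got {arg!r}")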
Example #4
    def __init__(self,
                 labels_to_load: Optional[Union[str, List[str]]] = None,
                 ignore_dubious: bool = False):
        """Initialize Ni-superalloy dataset with specified labels.

        Parameters:
            labels_to_load (str or List[str]): which labels to load. Options are
                'Yield Strength', 'Ultimate Tensile Strength', 'Stress Rupture Time',
                'Stress Rupture Stress', and 'Elongation'.
                If None, then all labels are loaded.
            ignore_dubious: whether or not to ignore samples that have something
                questionable about them

        """

        labels_to_load = params.optional_(
            labels_to_load,
            lambda arg: params.any_(
                arg,
                params.string,
                lambda arg: params.sequence(arg, type_=str),
            ),
        )
        ignore_dubious = params.boolean(ignore_dubious)

        filepath = self.DEFAULT_PATH
        data, labels = self._load_data_and_labels(filepath, labels_to_load,
                                                  ignore_dubious)
        super().__init__(data=data, labels=labels)
Example #5
    def __init__(self, maximize: bool = True, **kwargs):
        super().__init__(**kwargs)

        maximize = params.boolean(maximize)
        if maximize:
            self._direction = 1
        else:
            self._direction = -1
Example #6
    def _intersection(lhs: "TabularData",
                      rhs: "TabularData",
                      duplicates: bool = False) -> "TabularData":
        """Specialized intersection.

        For labeled data, labels are compared as well.

        The datasets must be compatible in the sense that both are of type
        TabularData or derived, and either labeled or unlabeled.

        Parameters:
            lhs: one of the two datasets to intersect ('left hand side')
            rhs: one of the two datasets to intersect ('right hand side')
            duplicates: if False (default), the returned data do not contain
                duplicate entries; if True, duplicates are taken into account.
                Both inputs and labels have to match for duplicates.

        Returns:
            TabularData containing only samples in both datasets, either without duplicates
            (set intersection) or taking duplicates into account (multiset intersection)

        Raises:
            NotImplementedError: if the set intersection cannot be computed
        """

        # parameter validation
        lhs = params.instance(lhs, TabularData)
        rhs = params.instance(rhs, TabularData)
        duplicates = params.boolean(duplicates)

        # special case: empty set
        if lhs.num_samples == 0:
            return lhs.subset()  # copy
        if rhs.num_samples == 0:
            return rhs.subset()  # copy

        if lhs.is_labeled != rhs.is_labeled:
            raise InvalidParameterError("compatible TabularData",
                                        "mismatch in labeling")

        # intersection calculation
        _lhs = TabularData._joint_data_labels(lhs)
        _rhs = TabularData._joint_data_labels(rhs)

        if _lhs.dtype != _rhs.dtype:
            raise InvalidParameterError(
                "Matching TabularData",
                f"{_lhs.dtype.descr} and {_rhs.dtype.descr}")

        if duplicates is False:
            _, indices, _ = np.intersect1d(
                _lhs, _rhs, return_indices=True)  # drops any duplicates
            indices = np.sort(indices)  # restores original order
            return lhs.subset(indices)
        else:  # duplicates = True
            raise NotImplementedError(  # todo: implement
                "specialized multiset intersection not implemented for TabularData"
            )
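The non-duplicate branch leans on np.intersect1d(..., return_indices=True), which returns indices into the first array; sorting those indices restores the original sample order. The same idiom on plain 1-D arrays:

import numpy as np

lhs = np.array([5, 3, 9, 3, 7])
rhs = np.array([7, 5, 1])

_, indices, _ = np.intersect1d(lhs, rhs, return_indices=True)  # drops any duplicates
indices = np.sort(indices)  # restore the order of appearance in lhs
print(lhs[indices])  # [5 7]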
Example #7
    def __init__(self, learner: Learner, scorer: Scorer, maximize: bool = True):
        self._learner = params.instance(learner, Learner)
        self._scorer = params.instance(scorer, Scorer)

        self._maximize = params.boolean(maximize)
        # If the goal is to maximize the score, invert the value because optimizers minimize.
        if self.maximize:
            self._direction = -1
        else:
            self._direction = 1

        self._steps = []
Example #8
    def __init__(self,
                 rng: int = None,
                 maxiter: int = 1000,
                 local_search_options: Optional[dict] = None,
                 initial_temp: float = 5230.0,
                 restart_temp_ratio: float = 2e-05,
                 visit: float = 2.62,
                 accept: float = -5.0,
                 maxfun: int = 1e7,
                 no_local_search: bool = False,
                 **kwargs):
        """Initialize state.

        Scipy-specific parameters are passed through.

        Parameters:
            rng: integer seed. Will be used to generate a new seed each time the optimizer is run.
            maxiter: The maximum number of iterations, where one iteration is one round of
                simulated annealing followed by one use of a local optimizer to find a local min.
            local_search_options: an optional kwargs dictionary to pass to the local minimizer,
                scipy.optimize.minimize: https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html
                If no args are passed then the minimizer defaults to the L-BFGS-B method, since
                the problems being studied have bounds but no constraints.
            initial_temp: The initial temperature; use higher values to facilitate a wider search
                and more easily escape local minima.
            restart_temp_ratio: The temperature, relative to the initial temperature, at which
                the annealing process restarts.
            visit: a parameter of the visiting distribution. A higher value corresponds to a
                heavier tail and longer potential jumps.
            accept: a parameter of the acceptance distribution. A lower value means that uphill
                moves are less likely to be accepted.
            maxfun: soft limit for the total number of function evaluations; it may be exceeded
                only to complete a local search that is in progress when the limit is reached.
            no_local_search: if true then the local search step is skipped, and this reduces
                 to a generalized simulated annealing optimizer.
        """
        super().__init__(rng=rng, **kwargs)

        self._maxiter = params.integer(maxiter, from_=1)
        self._local_search_options = local_search_options or {}  # TODO: verify dictionaries
        self._initial_temp = params.real(initial_temp, above=0.01, to=5e4)
        self._restart_temp_ratio = params.real(restart_temp_ratio,
                                               above=0.0,
                                               below=1.0)
        self._visit = params.real(visit, above=0.0, to=3.0)
        self._accept = params.real(accept, above=-1e4, to=-5.0)
        self._maxfun = params.integer(maxfun, from_=1)
        self._no_local_search = params.boolean(no_local_search)
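The validated values correspond to the parameters of scipy.optimize.dual_annealing. A standalone call with similar settings is sketched below; only core keyword arguments are shown because the name of the local-search options argument differs across SciPy versions:

import numpy as np
from scipy.optimize import dual_annealing

def sphere(x):
    return float(np.sum(x ** 2))

bounds = [(-5.0, 5.0)] * 3
result = dual_annealing(sphere, bounds, maxiter=1000, initial_temp=5230.0,
                        restart_temp_ratio=2e-05, visit=2.62, accept=-5.0,
                        no_local_search=False, seed=0)
print(result.x, result.fun)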
Example #9
    def best_score_trajectory(
        self, maximize: bool = True, length: Optional[int] = None
    ) -> Sequence[float]:
        """Calculate the best score found so far as a function of number of function evaluations.

        Parameters:
            maximize: whether the goal is to maximize (true) or minimize (false) the score
            length: total length of the result. If larger than the actual number of function
                evaluations, the result will be padded with the best value. If smaller than the
                actual number of evaluations, the result will be truncated.
                If None, the result is returned as-is.

        Returns:
            A sequence of floats, each one corresponding to the best score found at that point
            in the optimization trajectory.
        """
        maximize = params.boolean(maximize)
        length = params.optional_(length, lambda arg: params.integer(arg, from_=1))

        best_score = np.empty(self.num_evaluations)
        idx = 0
        best_score_so_far = self.steps[0].scores[0]
        direction = 1.0 if maximize else -1.0

        for optimization_iter in self.steps:
            for eval_ in optimization_iter.scores:
                if eval_ * direction > best_score_so_far * direction:
                    best_score_so_far = eval_
                best_score[idx] = best_score_so_far * direction
                idx += 1

        if length is not None:
            extra_padding = length - len(best_score)
            if extra_padding < 0:
                return best_score[:extra_padding]  # TODO: Raise a warning?
            return np.pad(best_score, ((0, extra_padding),), mode="edge")
        else:
            return best_score
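For the maximize case, the running best and the edge padding can also be written with NumPy primitives; a small sketch assuming a flat array of scores:

import numpy as np

scores = np.array([3.0, 5.0, 4.0, 7.0, 6.0])   # one score per function evaluation
best = np.maximum.accumulate(scores)           # best score found so far (maximize case)

length = 8
pad = length - len(best)
best = best[:pad] if pad < 0 else np.pad(best, (0, pad), mode="edge")  # pad with best value
print(best)  # [3. 5. 5. 7. 7. 7. 7. 7.]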
Example #10
    def __init__(
        self,
        fits: bool = True,
        fit_lambda: float = 1e-7,
        fit_weights: Optional[str] = None,
        base=10,
        **kwargs,
    ):
        """Initialize learning curve plot.

        Parameters:
            fits: if True, show estimated asymptotic fits
            fit_lambda: regularization strength for asymptotic fits; defaults to 1e-7
            fit_weights: if and how to weight fits; one of
                None: no weighting, "variance": weigh by variance for each training set size
            base: base for logarithmic plotting
            All parameters from base classes, in particular GeneralizedFunctionPlot and Plot.
        """

        # set learning curve-specific arguments if not explicitly set
        kwargs["axes_scales"] = kwargs.get("axes_scales", ("log", "log"))
        kwargs["axes_labels"] = kwargs.get(
            "axes_labels", ("training set size", "evaluation metric", None, None)
        )

        super().__init__(**kwargs)

        # parameters
        self._fits = params.boolean(fits)
        self._fit_lambda = params.real(fit_lambda, from_=0)
        self._fit_weights = params.any_(
            fit_weights, lambda arg: params.enumeration(arg, {"variance"}), params.none
        )
        self._base = params.real(base, from_=2)

        self._logf = lambda x: np.log(x) / np.log(self._base)
        self._powf = lambda x: np.power(self._base, x)
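The two lambdas at the end implement logarithms and powers in an arbitrary base via the change-of-base identity log_b(x) = ln(x) / ln(b); a quick standalone check:

import numpy as np

base = 10
logf = lambda x: np.log(x) / np.log(base)
powf = lambda x: np.power(base, x)

print(logf(1000.0))      # 3.0 (up to floating-point error)
print(powf(logf(42.0)))  # 42.0, powf inverts logf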
Example #11
    def subset(self,
               indices: Optional[np.ndarray] = None,
               duplicates: bool = False) -> TabularData:
        """Create finite subset of data.

        Parameters:
            indices: a real matrix of appropriate dimensions (rows are vectors)
            duplicates: if False (default), the returned subset does not contain
                duplicate entries; if True, duplicates are kept. Both inputs
                and labels have to match for duplicates.

        Returns:
            Finite dataset of vectors.
        """

        # indices is validated by calls to samples() and labels()
        duplicates = params.boolean(duplicates)

        data = self.samples(indices)
        labels = self.labels(indices) if self.is_labeled else None

        ds = TabularData(data=data, labels=labels)

        return ds if duplicates else ds.subset(duplicates=False)  # drop duplicate entries unless requested
Example #12
    def __init__(
        self,
        rng: int = None,
        uncertainties: Optional[str] = None,
        n_estimators: int = 100,
        criterion: str = "mse",
        max_depth: Optional[int] = None,
        min_samples_split: Union[int, float] = 2,
        min_samples_leaf: Union[int, float] = 1,
        min_weight_fraction_leaf: float = 0.0,
        max_features: Union[int, float, str, None] = "auto",
        max_leaf_nodes: Optional[int] = None,
        min_impurity_decrease: float = 0.0,
        # min_impurity_split deprecated
        bootstrap: bool = True,
        n_jobs: Optional[int] = None,
        ccp_alpha: float = 0.0,
        max_samples: Optional[Union[int, float]] = None,
        **kwargs,
    ):
        """Initialize state.

        sklearn-specific parameters are passed through to the implementation.

        Parameters:
            uncertainties: whether and how to compute predictive uncertainties; choices are
                None; by default, RandomForestRegressor does not return predictive uncertainties;
                "naive"; uses the ensembles standard deviation
            n_estimators: number of decision trees
            criterion: either variance reduction ("mse", mean squared error) or mean absolute error ("mae")
            max_depth: maximum depth of a tree; default is restricted only by min_samples_leaf
            min_samples_split: minimum number of samples required to split an internal node;
                float numbers indicate a fraction of number of training samples
            min_samples_leaf: minimum number of training samples required in a leaf node;
                float numbers indicate a fraction of number of training samples
            min_weight_fraction_leaf: minimum weighted fraction of weights required in a leaf node
            max_features: number of features considered when splitting; integers directly specify the number,
                floating point values specify which fraction of all features to use;
                "auto" uses all features, "sqrt" and "log2" use square root and binary logarithm of number of features
            max_leaf_nodes: maximum number of leaves a tree can have
            min_impurity_decrease: minimum impurity decrease required for splitting
            bootstrap: if False, the whole dataset is used to build trees
            n_jobs: number of parallel jobs; -1 to use all available processors; None means 1
            ccp_alpha: complexity parameter for minimal cost-complexity pruning.
            max_samples: number of input samples to draw during bootstrap; integers directly specify the number,
                floating point values specify which fraction of samples to use; all by default

        The sklearn.RandomForestRegressor parameters `oob_score`, `verbose`, `warm_start` are not considered.

        See skl.ensemble.ExtraTreesRegressor parameters.
        """

        super().__init__(rng=rng, **kwargs)

        # validate parameters

        self._uncertainties = params.enumeration(uncertainties, {None, "naive"})

        n_estimators = params.integer(n_estimators, from_=1)
        criterion = params.enumeration(criterion, {"mse", "mae"})
        max_depth = params.any_(max_depth, lambda arg: params.integer(arg, from_=1), params.none)
        min_samples_split = params.any_(
            min_samples_split,
            lambda arg: params.integer(arg, from_=2),
            lambda arg: params.real(arg, above=0.0, to=1.0),
        )
        min_samples_leaf = params.any_(
            min_samples_leaf,
            lambda arg: params.integer(arg, from_=1),
            lambda arg: params.real(arg, above=0.0, to=1.0),
        )
        min_weight_fraction_leaf = params.real(min_weight_fraction_leaf, from_=0.0, to=1.0)
        max_features = params.any_(
            max_features,
            lambda arg: params.integer(arg, above=0),
            lambda arg: params.real(arg, above=0.0, to=1.0),
            lambda arg: params.enumeration(arg, {"auto", "sqrt", "log2"}),
            params.none,
        )
        max_leaf_nodes = params.any_(
            max_leaf_nodes, lambda arg: params.integer(arg, from_=1), params.none
        )
        min_impurity_decrease = params.real(min_impurity_decrease, from_=0.0)
        bootstrap = params.boolean(bootstrap)
        n_jobs = params.any_(
            n_jobs,
            lambda arg: params.integer(arg, from_=-1, to=-1),
            lambda arg: params.integer(arg, from_=1),
            params.none,
        )
        ccp_alpha = params.real(ccp_alpha, from_=0.0)
        max_samples = params.any_(
            max_samples,
            lambda arg: params.integer(arg, from_=1),
            lambda arg: params.real(arg, from_=0.0, to=1.0),
            params.none,
        )

        self._model = ExtraTreesRegressor(
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            bootstrap=bootstrap,
            n_jobs=n_jobs,
            ccp_alpha=ccp_alpha,
            max_samples=max_samples,
        )
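The validated values are passed straight to sklearn.ensemble.ExtraTreesRegressor. A plain scikit-learn sketch with similar settings; note that recent scikit-learn releases name the "mse" criterion "squared_error":

import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 4))
y = X[:, 0] - 2 * X[:, 1] + rng.normal(scale=0.1, size=100)

model = ExtraTreesRegressor(
    n_estimators=100,
    criterion="squared_error",  # "mse" in older scikit-learn versions
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,   # required for max_samples to take effect
    max_samples=0.8,  # fraction of training samples drawn per tree
    random_state=0,
)
model.fit(X, y)
print(model.predict(X[:3]))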
Example #13
    def __init__(
        self,
        process: bool = True,
        join: bool = True,
        filter_: Callable[[dict], bool] = lambda _: True,
        samplef: Callable[[dict], dict] = lambda arg: arg,
        labelf: Callable[[float], Any] = lambda arg: arg,
        **kwargs,
    ):
        """Loads dataset.

        Parameters control preprocessing. Order:
        processing, joining, filtering, sample and label transform.

        Parameters:
            process: if False, entries are passed as-is; in particular, some formulas
                will contain variables (AxB1-x) and brackets; some labels will be
                intervals (from,to); if True, formulas are turned into simple
                sum formulas (no variables, no brackets) and all labels will be numbers
                CURRENTLY, such formulas are only flagged, but not parsed; only labels change
            join: whether to join entries with the same chemical sum formula; this changes
                labels from single numbers to varying-length sequences of numbers
            filter_: a function that accepts a sample and returns whether to keep it
                (True) or exclude it (False). Default retains all samples
            samplef: function accepting and returning a sample; applied to all samples
                as post-processing
            labelf: function accepting and returning a label; applied to all labels
                as post-processing

        A conservative parametrization is:
        SuperconductorsCitrine2016Dataset(
            process=True, join=True, 
            filter_=lambda e: not any(e["flagged_formula"]),
            samplef=lambda e: e["formula"],
            labelf=lambda tc: np.median(tc)
        )
        This results in a dataset of valid formulas with Tc as labels.

        All entries have these keys:
            "citation1": first citation URL
            "citation2": second citation URL if it exists, empty string otherwise
            "formula": chemical sum formula
            "Tc/K": superconducting critical temperature in K
            "process_Tc/K": True if label was changed in processing, False otherwise
            "process_formula": True if formula was changed in processing, False otherwise
            "flagged_formula": True if formula was flagged for some reason,
                including presence of variables (x, y) or unclear notation "+d"
        These entries can be used to filter.

        Raises:
            InvalidParameterError: on invalid parameter values

        Examples:
            sc = SuperconductorsCitrine2016Dataset()
            sc = SuperconductorsCitrine2016Dataset(process=True, filter_=lambda e: not e['flagged_formula'])
        """

        # todo: params test for functions with signature
        process = params.boolean(process)
        join = params.boolean(join)

        # load data
        data = self._load_data()

        # process data if requested
        if process:
            data = [self._process(e) for e in data]

        # join data if requested
        # the code below has roughly quadratic runtime. This does not matter for a small
        # dataset like this one, but this solution will not be adequate for larger datasets
        if join:
            # group data by unique formula
            # todo: canonicalize formula
            groups = {None: 0}
            for i, f in enumerate([e["formula"] for e in data]):
                groups[f] = groups.get(f, max(groups.values()) + 1)
                data[i]["group"] = groups[f]
            del groups[None]

            joined_data = []
            for f in groups.keys():  # iterate over unique formulae
                entry = {
                    "formula": f,
                    "citation1": [],
                    "citation2": [],
                    "Tc/K": [],
                    "process_Tc/K": [],
                    "process_formula": [],
                    "flagged_formula": [],
                }
                for e in data:
                    if e["formula"] == f:
                        for p in [
                                "citation1",
                                "citation2",
                                "Tc/K",
                                "process_Tc/K",
                                "process_formula",
                                "flagged_formula",
                        ]:
                            entry[p].append(e[p])
                joined_data.append(entry)
            data = joined_data

        # filter data
        data = [e for e in data if filter_(e)]

        # split out T_c as labels
        labels = [labelf(e["Tc/K"]) for e in data]
        for i in range(len(data)):
            del data[i]["Tc/K"]
            data[i] = samplef(data[i])

        # initialize state
        super().__init__(data=np.array(data),
                         labels=np.array(labels),
                         **kwargs)
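The join step above scans the full dataset once per unique formula, which is quadratic in the number of entries (acceptable here, as the comment notes). A linear-time sketch of the same grouping with a dictionary keyed by formula, using made-up entries:

from collections import defaultdict

data = [
    {"formula": "MgB2", "Tc/K": 39.0, "citation1": "a"},
    {"formula": "NbTi", "Tc/K": 9.2, "citation1": "b"},
    {"formula": "MgB2", "Tc/K": 38.5, "citation1": "c"},
]

grouped = defaultdict(lambda: defaultdict(list))
for entry in data:
    for key, value in entry.items():
        if key != "formula":
            grouped[entry["formula"]][key].append(value)

joined = [{"formula": f, **dict(props)} for f, props in grouped.items()]
# [{'formula': 'MgB2', 'Tc/K': [39.0, 38.5], 'citation1': ['a', 'c']}, ...]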
Example #14
    def __init__(
        self,
        source: str,
        exclude_uncharacterized: bool = False,
        exclude_unconverged: bool = False,
        **kwargs,
    ):
        """Load dataset.

        See TabularDataFromPandas for parameters to control pre-processing on loading,
        such as joining, filtering, as well as sample and label transformations.

        Parameters:
            source: path to underlying data file (see class docstring)
            exclude_uncharacterized: exclude molecules listed in file 'uncharacterized.txt'
            exclude_unconverged: exclude molecules listed as hard to converge in file 'readme.txt'

        The files 'uncharacterized.txt' and 'readme.txt' are part of the original dataset.
        The indices of these uncharacterized and unconverged molecules are fixed; in particular,
        they are not loaded dynamically from the dataset.

        Samples:
            index: unique integer
            atomic_number: k-vector of atomic numbers (proton numbers)
            coordinates: k x 3 array of k 3d points (x,y,z)
            mulliken_charges: k-vector of Mulliken partial charges
            frequencies: frequencies (either 3k-5 or 3k-6)
            smiles_gdb9: SMILES encoding of the original GDB9 molecular structure graph
            smiles_relaxed: SMILES encoding of the relaxed-geometry molecular structure graph
            inchi_gdb9: InChI encoding of the original GDB9 molecular structure graph
            inchi_relaxed: InChI encoding of the relaxed-geometry molecular structure graph

        Labels:
             0  A       GHz          Rotational constant A
             1  B       GHz          Rotational constant B
             2  C       GHz          Rotational constant C
             3  mu      Debye        Dipole moment
             4  alpha   Bohr^3       Isotropic polarizability
             5  homo    Hartree      Energy of Highest occupied molecular orbital (HOMO)
             6  lumo    Hartree      Energy of Lowest unoccupied molecular orbital (LUMO)
             7  gap     Hartree      Gap, difference between LUMO and HOMO
             8  r2      Bohr^2       Electronic spatial extent
             9  zpve    Hartree      Zero point vibrational energy
            10  U0      Hartree      Internal energy at 0 K
            11  U       Hartree      Internal energy at 298.15 K
            12  H       Hartree      Enthalpy at 298.15 K
            13  G       Hartree      Free energy at 298.15 K
            14  Cv      cal/(mol K)  Heat capacity at 298.15 K

        Raises:
            InvalidParameterError: on invalid parameter values
        """

        # parameter validation
        source = params.string(source)  # todo: params.filename
        exclude_uncharacterized = params.boolean(exclude_uncharacterized)
        exclude_unconverged = params.boolean(exclude_unconverged)

        # load raw data
        # bunzip2 takes about 7s for this 85 MB file
        # therefore, support both reading the unpacked file or the packed ones
        if source[-4:] == ".xyz":  # unpacked
            with open(source, "tr") as f:
                raw = f.read()
        elif source[-8:] == ".xyz.bz2":  # bz2-packed
            with open(source, "br") as f:
                raw = bz2.decompress(f.read()).decode(encoding="ascii")
        elif source[-4:] == ".zip":  # bz2-packed within zip archive
            with zipfile.ZipFile(source) as zf:
                with zf.open("dsgdb9nsd.xyz.bz2") as f:  # filename as in downloaded dataset
                    raw = bz2.decompress(f.read()).decode(encoding="ascii")

        # parse data
        propnames = [
            "A",
            "B",
            "C",
            "mu",
            "alpha",
            "h**o",
            "lumo",
            "gap",
            "r2",
            "zpve",
            "U0",
            "U",
            "H",
            "G",
            "Cv",
        ]

        def parse(mol: str):
            lines = mol.split("\n")
            result = {}

            na = int(lines[0])  # number of atoms

            # gdb identifier, molecule's index, and properties 1-15
            props = lines[1].split()
            assert props[0] == "gdb", "internal error: wrong file format parsing QM9 molecule"
            result["index"] = int(props[1])
            assert len(propnames) == len(props[2:]), "internal error parsing QM9 molecule"
            for key, value in zip(propnames, props[2:]):
                result[key] = float(value)

            atomblock = np.array([line.split() for line in lines[2:na + 2]])  # array of strings

            result["atomic_number"] = [element_data(an, "Z") for an in atomblock[:, 0]]
            result["coordinates"] = np.asfarray(atomblock[:, 1:4])
            result["mulliken_charges"] = np.asfarray(atomblock[:, 4])

            result["frequencies"] = np.asfarray(lines[na + 2].split())
            result["smiles_gdb9"], result["smiles_relaxed"] = lines[na + 3].split()
            result["inchi_gdb9"], result["inchi_relaxed"] = lines[na + 4].split()

            return result

        # alternative via qmmlpack:
        # qmml.import_extxyz(raw, additional_properties=True)

        parsed = [parse(entry) for entry in raw.split("\n\n")]
        data = pd.DataFrame(parsed)

        # drop molecule subsets if requested
        if exclude_uncharacterized:
            filename = os.path.join(os.path.dirname(__file__),
                                    "uncharacterized.txt")
            with open(filename, "rt") as f:
                excluded = f.read().split("\n")
                while not excluded[0].startswith("  "):  # drop header lines
                    del excluded[0]
                while not excluded[-1].startswith("  "):  # drop footer lines
                    del excluded[-1]
                excluded = [int(line.split()[0]) for line in excluded]
            data = data[~data["index"].isin(excluded)]

        if exclude_unconverged:
            excluded = [
                21725,
                87037,
                59827,
                117523,
                128113,
                129053,
                129152,
                129158,
                130535,
                6620,
                59818,
            ]
            data = data[~data["index"].isin(excluded)]

        super().__init__(data=data, labels=propnames, **kwargs)
Example #15
    def __init__(
        self,
        num_trees: int = -1,
        use_jackknife: bool = True,
        bias_learner: Optional[BaseLoloLearner] = None,
        leaf_learner: Optional[BaseLoloLearner] = None,
        subset_strategy: Union[str, int, float] = "auto",
        min_leaf_instances: int = 1,
        max_depth: int = 2 ** 30,
        uncertainty_calibration: bool = False,
        randomize_pivot_location: bool = False,
        # randomly_rotate_features: bool = False, currently in develop branch
        **kwargs
    ):
        """Initialize random forest model.

        See lolo Scala source code for initialization parameters:
        https://github.com/CitrineInformatics/lolo/blob/develop/src/main/scala/io/citrine/lolo/learners/RandomForest.scala

        When using `uncertainty_calibration=False` (the default), the number of trees
        `num_trees` should be set to a multiple of the number n of training samples,
        `num_trees = 4 * n` or higher. When using `uncertainty_calibration=True`,
        `num_trees = 64` is sufficient.

        Parameters:
            num_trees: number of trees in the forest; -1 uses number of training samples
            use_jackknife: whether to use jackknife-based variance estimates
            bias_learner: algorithm used to model bias
            leaf_learner: algorithm used at each leaf of the random forest
            subset_strategy: strategy to determine number of features used at each split
                "auto": use the default for lolo (all features for regression, sqrt for classification)
                "log2": use the base 2 log of the number of features
                "sqrt": use the square root of the number of features
                integer: set the number of features explicitly
                float: use a certain fraction of the features
            min_leaf_instances: minimum number of features used at each leaf
            max_depth: maximum depth of decision trees
            uncertainty_calibration: whether to empirically re-calibrate predicted uncertainties
                based on out-of-bag residuals
            randomize_pivot_location: whether to draw pivots randomly or always select the midpoint
            randomly_rotate_features: whether to rotate real scalar features for each tree
        """

        super().__init__(**kwargs)

        # validate parameters

        num_trees = params.any_(
            num_trees,
            lambda i: params.integer(i, above=0),
            lambda i: params.integer(i, from_=-1, to=-1),
        )

        use_jackknife = params.boolean(use_jackknife)

        bias_learner = params.any_(
            bias_learner, lambda arg: params.instance(arg, BaseLoloLearner), params.none
        )

        leaf_learner = params.any_(
            leaf_learner, lambda arg: params.instance(arg, BaseLoloLearner), params.none
        )

        subset_strategy = params.any_(
            subset_strategy,
            lambda s: params.enumeration(s, {"auto", "log2", "sqrt"}),
            lambda s: params.integer(s, above=0),
            lambda s: params.real(s, above=0),
        )

        min_leaf_instances = params.integer(min_leaf_instances, above=0)

        # the default 2**30 works for 32 bit or larger architectures
        max_depth = params.integer(max_depth, above=0)

        uncertainty_calibration = params.boolean(uncertainty_calibration)

        randomize_pivot_location = params.boolean(randomize_pivot_location)

        # randomly_rotate_features = params.boolean(randomly_rotate_features)

        # set up model

        try:
            self._model = RandomForestRegressor(
                num_trees=num_trees,
                use_jackknife=use_jackknife,
                bias_learner=bias_learner,
                leaf_learner=leaf_learner,
                subset_strategy=subset_strategy,
                min_leaf_instances=min_leaf_instances,
                max_depth=max_depth,
                uncertainty_calibration=uncertainty_calibration,
                randomize_pivot_location=randomize_pivot_location,
                # randomly_rotate_features=randomly_rotate_features,
            )
        except Py4JJavaError as e:
            raise BenchmarkError("instantiating lolo model failed") from e

        self._with_uncertainties = use_jackknife  # otherwise, deviations will be zero
Example #16
    def _complement(lhs: "TabularData",
                    rhs: "TabularData",
                    duplicates: bool = False) -> "TabularData":
        """Specialized (multi)set complement.

        For labeled data, labels are compared as well.

        The datasets must be compatible in the sense that both are of type
        DataMatrix or derived, and either labeled or unlabeled.

        Parameters:
            lhs: set A in A - B ('left hand side')
            rhs: set B in A - B ('right hand side')
            duplicates: if False (default), the returned data do not contain
                duplicate entries; if True, duplicates are taken into account.
                Both inputs and labels have to match for duplicates.

        Returns:
            Data containing all samples in lhs, but not in rhs, either without duplicates
            (set complement) or taking duplicates into account (multiset complement).
        """

        # parameter validation
        lhs = params.instance(lhs, TabularData)
        rhs = params.instance(rhs, TabularData)
        duplicates = params.boolean(duplicates)

        # special case: empty set
        if lhs.num_samples == 0:
            return lhs.subset()
        if rhs.num_samples == 0:
            return lhs.subset()

        if lhs.is_labeled != rhs.is_labeled:
            raise InvalidParameterError("compatible TabularData",
                                        "mismatch in labeling")

        # complement calculation
        _lhs = TabularData._joint_data_labels(lhs)
        _rhs = TabularData._joint_data_labels(rhs)

        if _lhs.dtype != _rhs.dtype:
            raise InvalidParameterError(
                "Matching TabularData",
                f"{_lhs.dtype.descr} and {_rhs.dtype.descr}")

        if duplicates is False:
            # np.setdiff1d does not return indices, so we don't use it

            indices = np.arange(_lhs.size)[np.isin(
                _lhs, _rhs, invert=True)]  # indexes into _lhs
            _, indices2 = np.unique(_lhs[indices],
                                    return_index=True)  # indexes into indices
            indices = indices[np.sort(indices2)]  # restores order

            return lhs.subset(indices)

            # below implementation is correct but a bit slower:

            # # remove duplicates from _lhs
            # _, indices = np.unique(_lhs, return_index=True)
            # indices = np.sort(indices)  # restores original order
            # _lhs = _lhs[indices]

            # # remove any element from _rhs
            # _, indices, _ = np.intersect1d(_lhs, _rhs, return_indices=True)
            # indices = np.setdiff1d(np.arange(_lhs.size), indices, assume_unique=True)
        else:  # duplicates = True
            raise NotImplementedError(  # todo: implement
                "specialized multiset complement not implemented for TabularData"
            )
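The complement branch combines np.isin(..., invert=True), which drops elements present in the right-hand side, with np.unique(..., return_index=True), which drops duplicates while preserving order. The same idiom on plain 1-D arrays:

import numpy as np

lhs = np.array([5, 3, 9, 3, 7])
rhs = np.array([7, 5, 1])

indices = np.arange(lhs.size)[np.isin(lhs, rhs, invert=True)]  # positions of lhs entries not in rhs
_, keep = np.unique(lhs[indices], return_index=True)           # drop duplicates among those
indices = indices[np.sort(keep)]                               # restore original order
print(lhs[indices])  # [3 9]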
Example #17
    def subset(self,
               indices: Optional[Sequence[int]] = None,
               duplicates: bool = False) -> "TabularData":
        """Create finite subset of data.

        Parameters:
            indices: A sequence of non-negative integers in the range [0, n),
                where n is number of samples. If no indices are specified,
                the whole dataset is returned.
            duplicates: if False (default), the returned subset does not contain
                duplicate entries; if True, duplicates are kept. Both inputs
                and labels have to match for duplicates.

        Returns:
            TabularData that contains only the specified samples.

        If duplicates are dropped, the first occurrence is kept.
        """

        # validate parameters
        indices = self._indices_testf(indices)
        duplicates = params.boolean(duplicates)

        # special case: empty set
        if self.num_samples == 0:
            if indices is not None and len(indices) > 0:
                raise InvalidParameterError("empty indices", indices,
                                            "indices into empty set")
            empty = np.empty(shape=(0, ) + self._data.shape[1:],
                             dtype=self._data.dtype)
            return TabularData(data=empty,
                               labels=[] if self.is_labeled else None)

        # special case: empty subset
        if indices is not None and len(indices) == 0:
            empty = np.empty(shape=(0, ) + self._data.shape[1:],
                             dtype=self._data.dtype)
            return TabularData(
                data=empty, labels=np.array([]) if self.is_labeled else None)

        # default is to return the whole set
        if indices is None:
            indices = ...  # Ellipsis

        # create subset data and labels
        subset = TabularData(
            data=self._data[indices],
            labels=self._labels[indices] if self.is_labeled else None)

        # remove duplicates if required
        if duplicates is False:
            joint = self._joint_data_labels(subset)
            _, unique = np.unique(joint, return_index=True)
            if len(unique) != subset.num_samples:  # only do work if there are any duplicates
                unique = np.sort(unique)  # restores original order
                subset = TabularData(
                    data=subset._data[unique],
                    labels=subset._labels[unique]
                    if subset.is_labeled else None,
                )
        # else: pass

        return subset
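The duplicate-removal step relies on np.unique returning the index of the first occurrence of each distinct entry; sorting those indices keeps the original order. A standalone illustration on a 1-D array:

import numpy as np

joint = np.array([4, 2, 4, 7, 2])
_, unique = np.unique(joint, return_index=True)  # index of first occurrence of each value
unique = np.sort(unique)                         # restore original order
print(joint[unique])  # [4 2 7]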
Example #18
    def __init__(
            self,
            select: Union[str, Sequence[str]] = "all",
            samplef: Callable[[Any], Any] = lambda arg: arg,
            stoichiometry_p_list: Sequence[int] = (0, 2, 3, 5, 7, 10),
            elemental_preset: str = "magpie",
            ionic_fast: bool = False,
            valence_orbitals: Sequence[str] = ("s", "p", "d", "f"),
            valence_props: Sequence[str] = ("avg", "frac"),
            **kwargs,
    ):
        """Initialize state.

        Selected parameters of the wrapped matminer classes Stoichiometry, ElementProperty,
        IonProperty, ValenceOrbital can be passed through. These parameters are prefixed
        with stoichiometry, elemental, ionic, valence. For example, stoichiometry_p_list
        is the p_list parameter of Stoichiometry. For further details on these, see
        https://github.com/hackingmaterials/matminer/blob/master/matminer/featurizers/composition.py

        Parameters:
            select: which feature sets to compute (by default, all). Specifying
                multiple sets, e.g., ('stoichiometry', 'elemental'), selects all of them.
                Valid choices:
                'all': all features
                'stoichiometry': norms of stoichiometric features
                'elemental': element properties
                'ionic': ion properties
                'valence': valence orbital shell features
            samplef: a function accepting and returning a sample. This enables
                transformation of samples, for example, to select an entry by key
                if sample is a dictionary, or to turn a dictionary into a vector.
                Default is to return the sample unchanged.
            stoichiometry_p_list: list of L_p norms to compute
            elemental_preset: matminer preset to use. Valid choices include:
                'magpie', 'deml', 'matminer', 'matscholar_el', 'megnet_el'
            ionic_fast: if True, assumes that elements exist in single oxidation state
            valence_orbitals: which valence orbitals to consider
            valence_props: whether to return average properties, fractional, or both

        Requires the matminer package (see file documentation).
        """

        super().__init__(**kwargs)

        SELECT_SETS = ("stoichiometry", "elemental", "ionic", "valence")

        if select == "all":
            select = SELECT_SETS
        if isinstance(select, str):
            select = (select,)  # tuple(str,) yields tuple of characters in str
        select = params.tuple_(
            select,
            lambda arg: params.enumeration(arg, set(SELECT_SETS)),
        )

        self._stoichiometry_p_list = params.tuple_(
            stoichiometry_p_list, lambda p: params.integer(p, from_=0))
        self._elemental_preset = params.enumeration(
            elemental_preset,
            {"magpie", "deml", "matminer", "matscholar_el", "megnet_el"})
        self._ionic_fast = params.boolean(ionic_fast)
        self._valence_orbitals = params.tuple_(
            valence_orbitals,
            lambda arg: params.enumeration(arg, {"s", "p", "d", "f"}))
        self._valence_props = params.tuple_(
            valence_props,
            lambda arg: params.enumeration(arg, {"avg", "frac"}))

        self.samplef = samplef  # todo: add callable to params

        # set up matminer
        try:
            import matminer
            import matminer.featurizers
            import matminer.featurizers.base
            import matminer.featurizers.composition
            import matminer.featurizers.conversions
            import pymatgen
        except ModuleNotFoundError as e:
            raise BenchmarkError(
                f"'{type(self).__name__}' requires 'matminer' and 'pymatgen' packages"
            ) from e

        self._composition = pymatgen.core.composition.Composition

        # set up features
        features = []
        if "stoichiometry" in select:
            features.append(
                matminer.featurizers.composition.Stoichiometry(
                    p_list=self._stoichiometry_p_list))
        if "elemental" in select:
            features.append(
                matminer.featurizers.composition.ElementProperty.from_preset(
                    self._elemental_preset))
        if "ionic" in select:
            features.append(
                matminer.featurizers.composition.IonProperty(
                    fast=self._ionic_fast))
        if "valence" in select:
            features.append(
                matminer.featurizers.composition.ValenceOrbital(
                    orbitals=self._valence_orbitals,
                    props=self._valence_props))

        self._mmfeatures = matminer.featurizers.base.MultipleFeaturizer(
            features)
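Once assembled, the MultipleFeaturizer is applied to pymatgen Composition objects. A usage sketch, assuming matminer and pymatgen are installed; the featurizer construction mirrors the defaults above:

from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers.composition import Stoichiometry, ValenceOrbital
from pymatgen.core.composition import Composition

featurizer = MultipleFeaturizer([
    Stoichiometry(p_list=(0, 2, 3, 5, 7, 10)),
    ValenceOrbital(orbitals=("s", "p", "d", "f"), props=("avg", "frac")),
])

comp = Composition("Fe2O3")
vector = featurizer.featurize(comp)  # one feature vector per composition
print(featurizer.feature_labels()[:5], vector[:5])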