Example #1
 def test_split_xz(self):
     x_dims = [(1, 10), (1, 10), (1, 10)]
     x = [1, 2, 3]
     z = ["red", "monkey"]
     xz = x + z
     x_split, z_split = split_xz(xz, x_dims)
     self.assertListEqual(x, x_split)
     self.assertListEqual(z, z_split)
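The test above pins down the contract of split_xz: the first len(x_dims) entries of the concatenated vector are x, and the remainder is z. Below is a minimal sketch of a function satisfying that contract; the x_only/z_only flags are inferred from how the later examples call it, and the real implementation may differ.

def split_xz(xz, x_dims, x_only=False, z_only=False):
    """Split a concatenated xz vector into its x and z parts.

    The x portion occupies the first len(x_dims) entries; everything
    after it belongs to z.
    """
    x, z = xz[:len(x_dims)], xz[len(x_dims):]
    if x_only:
        return x
    if z_only:
        return z
    return x, z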
Example #2
    def _z_dims(self, all_xz_unsearched, all_xz_searched):
        """
        Prepare dims to use in preprocessing for categorical dimensions.
        Gathers a list of possible dimensions from stored and current z vectors.
        Not actually used for creating a list of possible search points, only
        for helping to convert possible search points from categorical to
        integer/float.

        Args:
            all_xz_unsearched ([list]): The collection of xz points which have
                not been searched.
            all_xz_searched ([list]): The collection of xz points which have
                been searched.

        Returns:
            ([tuple]) dimensions for the z space
        """

        all_z_unsearched = [
            split_xz(xz, self.x_dims, z_only=True) for xz in all_xz_unsearched
        ]
        all_z_searched = [
            split_xz(xz, self.x_dims, z_only=True) for xz in all_xz_searched
        ]
        all_z = all_z_searched + all_z_unsearched

        if not all_z:
            return []

        dims = [(z, z) for z in all_z[0]]

        for i, dim in enumerate(dims):
            cat_values = []
            for z in all_z:
                if type(z[i]) in dtypes.others:
                    # the dimension is categorical
                    if z[i] not in cat_values:
                        cat_values.append(z[i])
                        dims[i] = cat_values
        return dims
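To make the categorical branch above concrete, here is a standalone walk-through of the same loop with hypothetical values; isinstance(z[i], str) stands in for the type(z[i]) in dtypes.others check, which is assumed to flag non-numeric types.

# Hypothetical walk-through of the _z_dims loop above. Numeric entries
# keep the degenerate (v, v) tuple taken from the first z vector;
# categorical entries are replaced by the list of observed values.
all_z = [[1.2, "red"], [3.4, "blue"], [5.6, "red"]]

dims = [(z, z) for z in all_z[0]]      # [(1.2, 1.2), ('red', 'red')]
for i, dim in enumerate(dims):
    cat_values = []
    for z in all_z:
        if isinstance(z[i], str):      # stand-in for: type(z[i]) in dtypes.others
            if z[i] not in cat_values:
                cat_values.append(z[i])
                dims[i] = cat_values

print(dims)  # [(1.2, 1.2), ['red', 'blue']]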
Example #3
    def stash(self, x, y, z, all_xz_new, n_completed):
        """
        Write documents to database after optimization.

        Args:
            x (iterable): The current x guess.
            y (iterable): The current y (objective function) value
            z (iterable): The z vector associated with x
            all_xz_new ([list] or [tuple]): The predicted next best guess(es),
                including their associated z vectors
            n_completed (int): The number of completed guesses/workflows

        Returns:
            opt_id (ObjectId): The inserted_id of the new optimization
                document in the database. If multiple opt_ids are valid
                (i.e. batch mode is enabled), the last opt_id is returned.
        """

        for xz_new in all_xz_new:
            # separate 'predicted' z features from the new x vector
            x_new, z_new = split_xz(xz_new, self.x_dims)
            x_new = convert_native(x_new)
            z_new = convert_native(z_new)

            # if it is a duplicate (such as a forced
            # identical first guess)
            forced_dupe = self.c.find_one({"x": x})

            acqmap = {
                "ei": "Expected Improvement",
                "pi": "Probability of Improvement",
                "lcb": "Lower Confidence Boundary",
                None: "Highest Value",
                "maximin": "Maximin Expected "
                "Improvement",
            }
            if self.predictor in self.builtin_predictors:
                predictorstr = (self.predictor + " with acquisition: " +
                                acqmap[self.acq])
                if self.n_objs > 1:
                    predictorstr += " using {} objectives".format(self.n_objs)
            else:
                predictorstr = self.predictor
            if forced_dupe:
                # only update the fields which should be updated
                self.c.find_one_and_update(
                    {"x": x},
                    {
                        "$set": {
                            "y": y,
                            "z": z,
                            "z_new": z_new,
                            "x_new": x_new,
                            "predictor": predictorstr,
                        }
                    },
                )
            else:
                # update all the fields, as it is a new document
                self.c.insert_one({
                    "z": z,
                    "y": y,
                    "x": x,
                    "z_new": z_new,
                    "x_new": x_new,
                    "predictor": predictorstr,
                    "index": n_completed + 1,
                })
            # ensure previously finished workflow results are not overwritten by
            # concurrent predictions
            if (self.c.count_documents({
                    "x": x_new,
                    "y": {
                        "$exists": 1,
                        "$ne": "reserved"
                    }
            }) == 0):
                # reserve the new x to prevent parallel processes from
                # registering it as unsearched, since the next iteration of this
                # process will be exploring it
                res = self.c.insert_one({"x": x_new, "y": "reserved"})
                opt_id = res.inserted_id
            else:
                raise ValueError(
                    "The predictor suggested a guess which has already been "
                    "tried: {}".format(x_new))
        return opt_id
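The concurrency-critical part of stash is the reserve-then-check block at the end. Below is a self-contained sketch of just that pattern, using mongomock as a stand-in collection (an assumption made so the sketch runs without a MongoDB server; any pymongo-compatible collection behaves the same).

# Reserve-then-check pattern from stash(), isolated.
import mongomock

c = mongomock.MongoClient().db.opt
x_new = [1, 2, 3]

# The new guess may be reserved only if no completed (non-"reserved")
# result for it exists yet.
if c.count_documents({"x": x_new, "y": {"$exists": 1, "$ne": "reserved"}}) == 0:
    opt_id = c.insert_one({"x": x_new, "y": "reserved"}).inserted_id
else:
    raise ValueError("Guess {} has already been tried.".format(x_new))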
Example #4
    def optimize(self, fw_spec, manager_id):
        """
        Run the optimization algorithm.

        Args:
            fw_spec (dict): The firework spec.
            manager_id (ObjectId): The MongoDB object id of the manager
                document.

        Returns:
            x (iterable): The current x guess.
            y (iterable): The current y (objective function) value
            z (iterable): The z vector associated with x
            all_xz_new ([list] or [tuple]): The predicted next best guess(es),
                including their associated z vectors
            n_completed (int): The number of completed guesses/workflows
        """
        x = list(fw_spec["_x"])
        y = fw_spec["_y"]
        if isinstance(y, (list, tuple)) and len(y) == 1:
            # unwrap a single-objective value passed as a one-element
            # sequence, so len() is never called on a scalar below
            y = y[0]
        if isinstance(y, (list, tuple)):
            self.n_objs = len(y)
            if self.acq not in ("maximin", None):
                raise ValueError(
                    "{} is not a valid acquisition function for multiobjective "
                    "optimization".format(self.acq))
        else:
            if self.acq == "maximin":
                raise ValueError(
                    "Maximin is not a valid acquisition function for single "
                    "objective optimization.")
            self.n_objs = 1

        # If process A suggests a certain guess and runs it, process B may
        # suggest the same guess while process A is running its new workflow.
        # Therefore, process A must reserve the guess. Line below releases
        # reservation on this document in case of workflow failure or end of
        # workflow.
        self.c.delete_one({"x": x, "y": "reserved"})

        # fetch additional attributes for constructing ML model
        z = self.get_z(x, *self.get_z_args, **self.get_z_kwargs)

        # use all possible training points as default
        n_completed = self.c.count_documents(self._completed)
        if not self.n_train_pts or self.n_train_pts > n_completed:
            self.n_train_pts = n_completed

        # check whether optimization should be done, if in batch mode
        batch_mode = self.batch_size != 1
        batch_ready = (n_completed not in (0, 1)
                       and (n_completed + 1) % self.batch_size == 0)

        x = convert_native(x)
        y = convert_native(y)
        z = convert_native(z)

        if batch_mode and not batch_ready:
            # 'None' predictor means this job was not used for
            # an optimization run.
            if self.c.find_one({"x": x}):
                if self.c.find_one({"x": x, "y": "reserved"}):
                    # For reserved guesses: update everything
                    self.c.find_one_and_update(
                        {
                            "x": x,
                            "y": "reserved"
                        },
                        {
                            "$set": {
                                "y": y,
                                "z": z,
                                "z_new": [],
                                "x_new": [],
                                "predictor": None,
                                "index": n_completed + 1,
                            }
                        },
                    )
                else:
                    # For completed guesses (ie, this workflow
                    # is a forced duplicate), do not update
                    # index, but update everything else
                    self.c.find_one_and_update(
                        {"x": x},
                        {
                            "$set": {
                                "y": y,
                                "z": z,
                                "z_new": [],
                                "x_new": [],
                                "predictor": None,
                            }
                        },
                    )
            else:
                # For new guesses: insert x, y, z, index,
                # predictor, and dummy new guesses
                self.c.insert_one({
                    "x": x,
                    "y": y,
                    "z": z,
                    "x_new": [],
                    "z_new": [],
                    "predictor": None,
                    "index": n_completed + 1,
                })
            self.pop_lock(manager_id)
            raise BatchNotReadyError

        # Mongo aggregation framework may give duplicate documents, so we cannot
        # use $sample to randomize the training points used
        searched_indices = random.sample(range(1, n_completed + 1),
                                         self.n_train_pts)
        searched_docs = self.c.find({"index": {
            "$in": searched_indices
        }},
                                    batch_size=10000)
        reserved_docs = self.c.find({"y": "reserved"}, batch_size=10000)
        reserved = []
        for doc in reserved_docs:
            reserved.append(doc["x"])
        all_y = [None] * n_completed
        all_y.append(y)
        all_x_searched = [None] * n_completed
        all_x_searched.append(x)
        z = list(z)
        all_xz_searched = [None] * n_completed
        all_xz_searched.append(x + z)
        for i, doc in enumerate(searched_docs):
            all_x_searched[i] = doc["x"]
            all_xz_searched[i] = doc["x"] + doc["z"]
            all_y[i] = doc["y"]

        all_x_space = self._discretize_space(self.x_dims)
        all_x_space = list(all_x_space) if self.z_file else all_x_space
        all_x_unsearched = []
        for xi in all_x_space:
            xj = list(xi)
            if xj not in all_x_searched and xj not in reserved:
                all_x_unsearched.append(xj)
                if len(all_x_unsearched) == self.n_search_pts:
                    break

        if self.z_file:
            if path.exists(self.z_file):
                with open(self.z_file, "rb") as f:
                    xz_map = pickle.load(f)
            else:
                xz_map = {
                    tuple(xi): self.get_z(xi, *self.get_z_args,
                                          **self.get_z_kwargs)
                    for xi in all_x_space
                }
                with open(self.z_file, "wb") as f:
                    pickle.dump(xz_map, f)

            all_xz_unsearched = [
                xi + xz_map[tuple(xi)] for xi in all_x_unsearched
            ]
        else:
            all_xz_unsearched = [
                xi + self.get_z(xi, *self.get_z_args, **self.get_z_kwargs)
                for xi in all_x_unsearched
            ]

        # if there are no more unsearched points in the entire
        # space, either they have been searched (ie have x, y,
        # and z) or have been reserved.
        if len(all_xz_unsearched) < 1:
            if self.is_discrete_all:
                raise ExhaustedSpaceError(
                    "The discrete space has been searched"
                    " exhaustively.")
            else:
                raise TypeError("A comprehensive list of points was exhausted "
                                "but the dimensions are not discrete.")
        z_dims = self._z_dims(all_xz_unsearched, all_xz_searched)
        xz_dims = self.x_dims + z_dims

        # run machine learner on Z or X features
        if self.predictor in self.builtin_predictors:
            model = self.builtin_predictors[self.predictor]
            all_xz_searched = self._encode(all_xz_searched, xz_dims)
            all_xz_unsearched = self._encode(all_xz_unsearched, xz_dims)
            all_xz_new_onehot = []
            for _ in range(self.batch_size):
                xz1h = self._predict(
                    all_xz_searched,
                    all_y,
                    all_xz_unsearched,
                    model(*self.predictor_args, **self.predictor_kwargs),
                    self.maximize,
                    scaling=True,
                )
                ix = all_xz_unsearched.index(xz1h)
                all_xz_unsearched.pop(ix)
                all_xz_new_onehot.append(xz1h)
            all_xz_new = [
                self._decode(xz_onehot, xz_dims)
                for xz_onehot in all_xz_new_onehot
            ]

        elif self.predictor == "random":
            all_xz_new = random.sample(all_xz_unsearched, self.batch_size)

        else:
            # If using a custom predictor, automatically convert
            # categorical info to one-hot encoded ints.
            # Used when a custom predictor cannot natively use
            # categorical info
            if self.onehot_categorical:
                all_xz_searched = self._encode(all_xz_searched, xz_dims)
                all_xz_unsearched = self._encode(all_xz_unsearched, xz_dims)

            try:
                predictor_fun = deserialize(self.predictor)
            except Exception as E:
                raise NameError("The custom predictor {} didnt import "
                                "correctly!\n{}".format(self.predictor, E))

            all_xz_new = predictor_fun(
                all_xz_searched,
                all_y,
                self.x_dims,
                all_xz_unsearched,
                *self.predictor_args,
                **self.predictor_kwargs,
            )
            if self.onehot_categorical:
                all_xz_new = self._decode(all_xz_new, xz_dims)

            if not isinstance(all_xz_new[0], (list, tuple)):
                all_xz_new = [all_xz_new]

        # duplicate checking for custom optimizer functions
        if self.duplicate_check:

            if not self.enforce_sequential:
                raise ValueError(
                    "Duplicate checking cannot work when "
                    "optimizations are not enforced sequentially.")
            if (self.predictor not in self.builtin_predictors
                    and self.predictor != "random"):
                all_x_new = [
                    split_xz(xz_new, self.x_dims, x_only=True)
                    for xz_new in all_xz_new
                ]
                all_x_searched = [
                    split_xz(xz, self.x_dims, x_only=True)
                    for xz in all_xz_searched
                ]
                if self.tolerances:
                    for n, x_new in enumerate(all_x_new):
                        if is_duplicate_by_tolerance(
                                x_new, all_x_searched,
                                tolerances=self.tolerances):
                            all_xz_new[n] = random.choice(all_xz_unsearched)
                else:
                    if self.is_discrete_all:
                        # test only for x, not xz because custom predicted z
                        # may not be accounted for
                        for n, x_new in enumerate(all_x_new):
                            if x_new in all_x_searched or x_new == x:
                                all_xz_new[n] = random.choice(
                                    all_xz_unsearched)
                    else:
                        raise ValueError("Define tolerances parameter to "
                                         "duplicate check floats.")
        return x, y, z, all_xz_new, n_completed
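The batch-readiness arithmetic near the top of optimize is worth spelling out; here is a small hypothetical helper reproducing it.

# The batch gate from optimize(), as a standalone helper (hypothetical).
# With batch_size = 3, the optimizer only runs on the guesses that
# complete a batch, i.e. when n_completed is 2, 5, 8, ...
def batch_ready(n_completed, batch_size):
    return n_completed not in (0, 1) and (n_completed + 1) % batch_size == 0

assert [n for n in range(10) if batch_ready(n, 3)] == [2, 5, 8]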
Example #5
    def run_task(self, fw_spec):
        """
        FireTask for running an optimization loop.

        Args:
            fw_spec (dict): the firetask spec. Must contain a '_y' key with
                a float type field and must contain a '_x' key containing a
                vector uniquely defining the point in search space.

        Returns:
            (FWAction) A workflow based on the workflow creator and a new,
            optimized guess.
        """
        pid = f"{getpid()}@{gethostname()}"
        sleeptime = 0.01
        max_runs = int(self.timeout / sleeptime)
        max_resets = 3

        # Running stepwise optimization for concurrent processes requires a
        # manual 'lock' on the optimization database to prevent duplicate
        # guesses. The first process sets up a manager document which handles
        # locking and queueing processes by PID. The single, active process in
        # the lock is free to access optimization data; the queue of the manager
        # holds parallel process PIDs waiting to access the db. When the active
        # process finishes, it removes itself from the lock and moves the first
        # queue PID into the lock, allowing the next process to begin
        # optimization. Each process continually tries to either queue or place
        # itself into the lock if not active.

        for run in range(max_resets * max_runs):
            manager_count = self.c.count_documents(self._manager)
            if manager_count == 0:
                self.c.insert_one({
                    "lock": pid,
                    "queue": [],
                    "doctype": "manager"
                })
            elif manager_count == 1:
                # avoid bootup problems if manager lock is being deleted
                # concurrently with this check
                try:
                    manager = self.c.find_one(self._manager)
                    manager_id = manager["_id"]
                    lock = manager["lock"]
                except TypeError:
                    continue

                if lock is None:
                    self.c.find_one_and_update({"_id": manager_id},
                                               {"$set": {
                                                   "lock": pid
                                               }})

                elif self.enforce_sequential and lock != pid:
                    if pid not in manager["queue"]:

                        # avoid bootup problems if manager queue is being
                        # deleted concurrently with this check
                        try:
                            self.c.find_one_and_update(
                                {"_id": manager_id}, {"$push": {
                                    "queue": pid
                                }})
                        except TypeError:
                            continue
                    else:
                        sleep(sleeptime)
                elif not self.enforce_sequential or (self.enforce_sequential
                                                     and lock == pid):
                    try:
                        x, y, z, all_xz_new, n_completed = self.optimize(
                            fw_spec, manager_id)
                    except BatchNotReadyError:
                        return None
                    except Exception:
                        self.pop_lock(manager_id)
                        raise

                    # make sure a process has not timed out and changed the lock
                    # pid while this process is computing the next guess
                    try:
                        if (self.c.find_one(self._manager)["lock"] != pid
                                or self.c.count_documents(self._manager) == 0):
                            continue
                        else:
                            opt_id = self.stash(x, y, z, all_xz_new,
                                                n_completed)
                    except TypeError as E:
                        warnings.warn(
                            "Process {} probably timed out while "
                            "computing next guess, with exception {}."
                            " Try shortening the training time or "
                            "lengthening the timeout for OptTask!"
                            "".format(pid, E),
                            RuntimeWarning,
                        )
                        raise E
                        # continue
                    self.pop_lock(manager_id)
                    all_x_new = [
                        split_xz(xz_new, self.x_dims, x_only=True)
                        for xz_new in all_xz_new
                    ]
                    if not isinstance(self.wf_creator_args, (list, tuple)):
                        raise TypeError(
                            "wf_creator_args should be a list/tuple of "
                            "positional arguments.")

                    if not isinstance(self.wf_creator_kwargs, dict):
                        raise TypeError(
                            "wf_creator_kwargs should be a dictionary of "
                            "keyword arguments.")

                    new_wfs = [
                        self.wf_creator(x_new, *self.wf_creator_args,
                                        **self.wf_creator_kwargs)
                        for x_new in all_x_new
                    ]
                    for wf in new_wfs:
                        self.lpad.add_wf(wf)
                    return FWAction(
                        update_spec={"_optimization_id": opt_id},
                        stored_data={"_optimization_id": opt_id},
                    )
            else:
                # Delete the manager that this has created
                self.c.delete_one({"lock": pid})

            if run in [max_runs * k for k in range(1, max_resets)]:
                self.c.find_one_and_update(
                    self._manager, {"$set": {
                        "lock": None,
                        "queue": []
                    }})

            elif run == max_runs * max_resets - 1:
                raise Exception("The manager is still stuck after "
                                "resetting. Make sure no stalled processes "
                                "are in the queue.")
Example #6
    def run_task(self, fw_spec):
        """
        FireTask for running an optimization loop.

        Args:
            fw_spec (dict): the firetask spec. Must contain a '_y' key with
                a float type field and must contain a '_x' key containing a
                vector uniquely defining the point in search space.

        Returns:
            (FWAction) A workflow based on the workflow creator and a new,
            optimized guess.
        """
        pid = getpid()
        sleeptime = .01
        max_runs = int(self.timeout / sleeptime)
        max_resets = 3

        # Running stepwise optimization for concurrent processes requires a
        # manual 'lock' on the optimization database to prevent duplicate
        # guesses. The first process sets up a manager document which handles
        # locking and queueing processes by PID. The single, active process in
        # the lock is free to access optimization data; the queue of the manager
        # holds parallel process PIDs waiting to access the db. When the active
        # process finishes, it removes itself from the lock and moves the first
        # queue PID into the lock, allowing the next process to begin
        # optimization. Each process continually tries to either queue or place
        # itself into the lock if not active.

        for run in range(max_resets * max_runs):
            manager_count = self.c.count_documents(self._manager)
            if manager_count == 0:
                self.c.insert_one({'lock': pid, 'queue': [],
                                   'doctype': 'manager'})
            elif manager_count == 1:
                # avoid bootup problems if manager lock is being deleted
                # concurrently with this check
                try:
                    manager = self.c.find_one(self._manager)
                    manager_id = manager['_id']
                    lock = manager['lock']
                except TypeError:
                    continue

                if lock is None:
                    self.c.find_one_and_update({'_id': manager_id},
                                               {'$set': {'lock': pid}})

                elif self.enforce_sequential and lock != pid:
                    if pid not in manager['queue']:

                        # avoid bootup problems if manager queue is being
                        # deleted concurrently with this check
                        try:
                            self.c.find_one_and_update({'_id': manager_id},
                                                       {'$push': {'queue': pid}}
                                                       )
                        except TypeError:
                            continue
                    else:
                        sleep(sleeptime)
                elif not self.enforce_sequential or \
                        (self.enforce_sequential and lock == pid):
                    try:
                        x, y, z, all_xz_new, n_completed = \
                            self.optimize(fw_spec, manager_id)
                    except BatchNotReadyError:
                        return None
                    except Exception:
                        self.pop_lock(manager_id)
                        raise

                    # make sure a process has not timed out and changed the lock
                    # pid while this process is computing the next guess
                    try:
                        if self.c.find_one(self._manager)['lock'] != pid or \
                                self.c.count_documents(self._manager) == 0:
                            continue
                        else:
                            opt_id = self.stash(x, y, z, all_xz_new,
                                                n_completed)
                    except TypeError as E:
                        warnings.warn("Process {} probably timed out while "
                                      "computing next guess, with exception {}."
                                      " Try shortening the training time or "
                                      "lengthening the timeout for OptTask!"
                                      "".format(pid, E), RuntimeWarning)
                        raise E
                        # continue
                    self.pop_lock(manager_id)
                    all_x_new = [split_xz(xz_new, self.x_dims, x_only=True)
                                 for xz_new in all_xz_new]
                    if not isinstance(self.wf_creator_args, (list, tuple)):
                        raise TypeError(
                            "wr_creator_args should be a list/tuple of "
                            "positional arguments.")

                    if not isinstance(self.wf_creator_kwargs, dict):
                        raise TypeError(
                            "wr_creator_kwargs should be a dictionary of "
                            "keyword arguments.")

                    new_wfs = [self.wf_creator(x_new, *self.wf_creator_args,
                                               **self.wf_creator_kwargs)
                               for x_new in all_x_new]
                    for wf in new_wfs:
                        self.lpad.add_wf(wf)
                    return FWAction(update_spec={'_optimization_id': opt_id},
                                    stored_data={'_optimization_id': opt_id})
            else:
                # Delete the manager that this has created
                self.c.delete_one({'lock': pid})

            if run in [max_runs * k for k in range(1, max_resets)]:
                self.c.find_one_and_update(self._manager,
                                           {'$set': {'lock': None, 'queue': []}}
                                           )

            elif run == max_runs * max_resets - 1:
                raise Exception("The manager is still stuck after "
                                "resetting. Make sure no stalled processes "
                                "are in the queue.")