Example #1
File: task.py  Project: ardunn/TurboWorks
    def __init__(self, *args, **kwargs):
        super(OptTask, self).__init__(*args, **kwargs)

        # Configuration attrs
        lp = self.get("launchpad", LaunchPad.auto_load())
        if isinstance(lp, LaunchPad):
            lp = lp.to_dict()
        self.lpad = LaunchPad.from_dict(lp)
        self.opt_label = self.get("opt_label", "opt_default")
        self.c = getattr(self.lpad.db, self.opt_label)
        self.config = self.c.find_one({"doctype": "config"})
        if self.config is None:
            raise NotConfiguredError("Please use MissionControl().configure to "
                                     "configure the optimization database "
                                     "({} - {}) before running OptTask."
                                     "".format(self.lpad.db, self.opt_label))
        self.wf_creator = deserialize(self.config["wf_creator"])
        self.x_dims = self.config["dimensions"]
        self._xdim_types = self.config["dim_types"]
        self.is_discrete_all = self.config["is_discrete_all"]
        self.is_discrete_any = self.config["is_discrete_any"]
        self.wf_creator_args = self.config["wf_creator_args"] or []
        self.wf_creator_kwargs = self.config["wf_creator_kwargs"] or {}
        self.predictor = self.config["predictor"]
        self.predictor_args = self.config["predictor_args"] or []
        self.predictor_kwargs = self.config["predictor_kwargs"] or {}
        self.maximize = self.config["maximize"]
        self.n_search_pts = self.config["n_search_pts"]
        self.n_train_pts = self.config["n_train_pts"]
        self.n_bootstraps = self.config["n_bootstraps"]
        self.acq = self.config["acq"]
        self.space_file = self.config["space_file"]
        self.onehot_categorical = self.config["onehot_categorical"]
        self.duplicate_check = self.config["duplicate_check"]
        self.get_z = self.config["get_z"]
        if self.get_z:
            self.get_z = deserialize(self.config['get_z'])
        else:
            self.get_z = lambda *ars, **kws: []
        self.get_z_args = self.config["get_z_args"] or []
        self.get_z_kwargs = self.config["get_z_kwargs"] or {}
        self.z_file = self.config["z_file"]
        self.enforce_sequential = self.config["enforce_sequential"]
        self.tolerances = self.config["tolerances"]
        self.batch_size = self.config["batch_size"]
        self.timeout = self.config["timeout"]

        # Declared attrs
        self.n_objs = None
        plist = [RandomForestRegressor, GaussianProcessRegressor,
                 ExtraTreesRegressor, GradientBoostingRegressor]
        self.builtin_predictors = {p.__name__: p for p in plist}
        self._n_cats = 0
        self._encoding_info = []

        # Query formats
        self._completed = {'x': {'$exists': 1}, 'y': {'$exists': 1,
                                                      '$ne': 'reserved'},
                           'z': {'$exists': 1}}
        self._manager = {'lock': {'$exists': 1}, 'queue': {'$exists': 1}}
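
For orientation, here is a minimal sketch of how a task configured this way can be attached to a workflow. The launchpad/opt_label task keys and the "_x" spec key come from the code above; the wf_creator body and task names are illustrative assumptions, not the project's own example.

from fireworks import Firework, LaunchPad, Workflow
from rocketsled import OptTask

def my_wf_creator(x):
    # Hypothetical sketch: your own FireTasks would compute the objective
    # and place it in spec["_y"] before the rocketsled Firework runs.
    opt_fw = Firework(
        [OptTask(launchpad=LaunchPad.auto_load().to_dict(),
                 opt_label="opt_default")],
        spec={"_x": x},
        name="RocketsledFW")
    return Workflow([opt_fw])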
Example #2
    def configure(self, wf_creator, dimensions, **kwargs):
        """
        Set up the optimization config. Required before using OptTask, but only
        needs to be done once. To reconfigure, use MissionControl.reset and then
        use configure again.

        Args:

        wf_creator (function or str): The function object that creates the
            workflow based on a unique vector, x. Alternatively, the full string
            module path to that function, e.g. "mypkg.mymodule.my_wf_creator",
            which must be importable and found in PYTHONPATH.
        dimensions ([tuple]): each 2-tuple in the list defines one dimension in
            the search space in (low, high) format.
            For categorical or discontinuous dimensions, includes all possible
            categories or values as a list of any length or a tuple of length>2.
            Example: dimensions = [(1, 100), (9.293, 18.2838), ("red",
            "blue", "green")].
        **kwargs: Keyword arguments for defining the optimization. A full list
            of possible kwargs is given below:

            Optimization data:
            opt_label (str): The label to use for this collection of
                optimization data.

            Workflow creator function:
            wf_creator_args (list): the positional args to be passed to the
                wf_creator function alongside the new x vector
            wf_creator_kwargs (dict): the kwargs to be passed to the
                wf_creator function alongside the new x vector

            Predictors (optimization):
            predictor (function or str): a function which given a list of
                searched points and unsearched points, returns an optimized
                guess.

                To use a builtin predictor, pass in one of:
                    'GaussianProcessRegressor',
                    'RandomForestRegressor',
                    'ExtraTreesRegressor',
                    'GradientBoostingRegressor',
                    'random' (random guess)
                The default is 'GaussianProcessRegressor'

                To use a custom predictor, pass in the function object.
                Alternatively, the full string module path to that function,
                e.g. "mypkg.mymodule.my_predictor", which must importable and
                found in PYTHONPATH.
                Example builtin predictor: 'GaussianProcessRegressor'
                Example custom predictor: my_predictor
                Example custom predictor 2: 'my_pkg.my_module.my_predictor'
            predictor_args (list): the positional args to be passed to the model
                along with a list of points to be searched. For sklearn-based
                predictors included in OptTask, these positional args are passed
                to the init method of the chosen model. For custom predictors,
                these are passed to the chosen predictor function alongside the
                searched guesses, the output from searched guesses, and an
                unsearched space to be used with optimization.
            predictor_kwargs (dict): the kwargs to be passed to the model.
                Similar to predictor_args.
            n_search_pts (int): The number of points to be searched in the
                search space when choosing the next best point. Choosing more
                points to search may increase the effectiveness of the
                optimization but take longer to evaluate. The default is 1000.
            n_train_pts (int): The number of already explored points to be
                chosen for training. Default is None, meaning all available
                points will be used for training. Reduce the number of points to
                decrease training times.
            n_bootstraps (int): The number of times each optimization should
                sample, train, and predict values when generating uncertainty
                estimates for prediction. At least 10 data points must be
                present for bootstrapping. Not used if: acq not specified,
                custom predictor used, or GaussianProcessRegressor used.
            acq (str): The acquisition function to use. Can be 'ei' for expected
                improvement, 'pi' for probability of improvement, or 'lcb' for
                lower confidence bound. Defaults to None, which means no
                acquisition function is used, and the highest predicted point is
                picked (greedy algorithm). Only applies to builtin predictors.
            space_file (str): The fully specified path of a pickle file
                containing a list of all possible searchable vectors.
                For example '/Users/myuser/myfolder/myspace.p'. When loaded,
                this space_file should be a list of tuples.
            onehot_categorical (bool): If True, preprocesses categorical data
                (strings) to one-hot encoded binary arrays for use with custom
                predictor functions. Default False.
            duplicate_check (bool): If True, checks that custom optimizers are
                not making duplicate guesses; all built-in optimizers cannot
                duplicate guess. If the custom predictor suggests a duplicate,
                OptTask picks a random guess out of the remaining untried space.
                Default is no duplicate check, and an error is raised if a
                duplicate is suggested.
            tolerances (list): The tolerance of each feature when duplicate
                checking. For categorical features, put 'None'
                Example: Our dimensions are [(1, 100), ['red', 'blue'],
                (2.0, 20.0)]. We want our first parameter to be a duplicate
                only if it is exact, and our third parameter to be a duplicate
                if it is within 1e-6. Then:
                    tolerances=[0, None, 1e-6]

            z-vector features:
            get_z (string): the fully-qualified name of a function which, given
                an x vector, returns another vector z which provides extra
                information to the machine learner. The features defined in z
                are not used to run the workflow, but are used for learning. If
                z_features are enabled, ONLY z features will be used for
                learning (x vectors essentially become tags or identifiers
                only).
                Examples:
                    get_z = 'my_pkg.my_module.my_fun'
                    get_z = '/path/to/folder/containing/my_dir/my_module.my_fun'
            get_z_args (list): the positional arguments to be passed to the
                get_z function alongside x
            get_z_kwargs (dict): the kwargs to be passed to the get_z function
                alongside x
            z_file (str): The filename (pickle file) where OptTask should save
                /cache z calculations. Specify this argument if calculating z
                for many (n_search_pts) is not trivial and will cost time in
                computing. With this argument specified, each z will only be
                calculated once. Defaults to None, meaning that all unexplored z
                are re-calculated each iteration.
                Example:
                    z_file = '/path/to/z_guesses.p'

            Parallelism:
            enforce_sequential (bool): WARNING: Experimental feature! If True,
                enforces that RS optimizations are run sequentially (default),
                which prevents duplicate guesses from ever being run. If False,
                allows OptTasks to run optimizations in parallel, which may
                cause duplicate guesses with high parallelism.
            batch_size (int): The number of jobs to submit per batch for a batch
                optimization. For example, batch_size=5 will optimize every 5th
                job, then submit another 5 jobs based on the best 5 predictions
                (recomputing the acquisition function after each prediction).
            timeout (int): The number of seconds to wait before resetting the
                lock on the db.

        Returns:
            None: If you want to run the OptTask workflow, you'll need to pass
            in the launchpad and opt_label arguments in your wf_creator.
        """
        config = get_default_opttask_kwargs()
        config["launchpad"] = self.launchpad.to_db_dict()
        config["opt_label"] = self.opt_label
        for kw in kwargs.keys():
            if kw not in config:
                raise KeyError(
                    "{} is not a valid argument for configure. Choose "
                    "from: {}".format(kw, list(config.keys())))
            elif kw in ["get_z", "predictor"]:
                # Serialize callables; keep string module paths as given.
                if hasattr(kwargs[kw], '__call__'):
                    config[kw] = serialize(kwargs[kw])
                else:
                    config[kw] = kwargs[kw]
            else:
                config[kw] = kwargs[kw]
        if hasattr(wf_creator, '__call__'):
            wf_creator = serialize(wf_creator)
        config["wf_creator"] = wf_creator
        config["dimensions"] = dimensions

        # Determine data types of dimensions
        config["dim_types"] = check_dims(dimensions)
        config["is_discrete_any"] = is_discrete(dimensions, criteria="any")
        config["is_discrete_all"] = is_discrete(dimensions, criteria="all")

        # Ensure importable functions are importable
        try:
            deserialize(wf_creator)
        except ImportError as IE:
            self.logger.warn("wf_creator " + IMPORT_WARNING + "\n" + str(IE))
        try:
            pre = config["predictor"]
            if pre:
                if "." in pre:
                    deserialize(pre)
        except ImportError as IE:
            self.logger.warn("predictor " + IMPORT_WARNING + "\n" + str(IE))
        try:
            getz = config["get_z"]
            if getz:
                if "." in getz:
                    deserialize(getz)
        except ImportError as IE:
            self.logger.warn("get_z " + IMPORT_WARNING + "\n" + str(IE))

        # Ensure acquisition function is valid (for builtin predictors)
        acq_funcs = [None, 'ei', 'pi', 'lcb', 'maximin']
        if config['acq'] not in acq_funcs:
            raise ValueError(
                "Invalid acquisition function. Use 'ei', 'pi', 'lcb', "
                "'maximin' (multiobjective), or None.")
        config["doctype"] = "config"
        self.config = config
        if self.c.find_one({"doctype": "config"}):
            raise ValueError("A config is already present in this Launchpad "
                             "for opt_label=={}. Please use the MissionControl"
                             " reset method to reset the database config."
                             "".format(self.opt_label))
        else:
            self.c.insert_one(self.config)
            self.logger.info("Rocketsled configuration succeeded.")
        self.is_configured = True
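
A hedged usage sketch of configure, based only on the docstring above; the MissionControl constructor arguments and import path are assumptions.

from fireworks import LaunchPad
from rocketsled import MissionControl

# Hypothetical sketch; constructor arguments are assumptions.
lpad = LaunchPad.auto_load()
mc = MissionControl(lpad=lpad, opt_label="opt_default")
mc.configure(
    wf_creator="mypkg.mymodule.my_wf_creator",
    dimensions=[(1, 100), (9.293, 18.2838), ("red", "blue", "green")],
    predictor="RandomForestRegressor",
    acq="ei",
    duplicate_check=True,
    tolerances=[0, 1e-6, None])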
Example #3
def auto_setup(func, dimensions, wfname=None, launch_ready=False, **kwargs):
    """
    Automatically set up a FireWorks-based optimization loop with OptTask and
    your own function.

    The loop is set up as a script, which is written to the
    rocketsled/auto_sleds directory. All you need to do is set up your
    FireWorks launchpad and run the created script to get started.

    Make sure to pass in necessary launchpad data to OptTask through kwargs of
    this function!

    Args:

        func (function object): A function object accepting a single positional
            argument, x, a vector of ints/floats/strs and returning a single
            scalar, y.
        dimensions (list): A list of dimensions constraining each of the
            variables in x. Each 2-tuple in the list defines one dimension in
            the search space in (low, high) format. For categorical dimensions,
            includes all possible categories as a list.
            Example:
            dimensions = [(1,100), (9.293, 18.2838), ("red", "blue", "green")]
        wfname (str): The base name you want for the workflow.
        launch_ready (bool): If True, the created script can be executed
            immediately.
        kwargs: Keyword arguments to be passed as options to OptTask. Any
            valid OptTask argument may be given, for example lpad, host, port,
            name, opt_label, acq, predictor, etc.

    """
    # Determine the name and directory
    dir = os.path.dirname(os.path.abspath(__file__)) + '/auto_sleds'
    time_now = datetime.datetime.utcnow().strftime(FW_BLOCK_FORMAT)

    if not os.path.exists(dir):
        os.mkdir(dir)
    if not os.path.exists(dir + "/__init__.py"):
        with open(dir + "/__init__.py", "w") as ipy:
            ipy.write('"""\n This file has been autocreated by '
                      'auto_setup.py\n"""')
    if wfname:
        if "/" in wfname or " " in wfname:
            raise ValueError("Please do not use ' ' or '/' in the wf name.")
    else:
        wfname = "autosled_" + time_now

    filename = dir + "/" + wfname + ".py"
    if os.path.exists(filename):
        warnings.warn("That workflow file has already been created! Appending "
                      "the current datetime to the filename.")
        filename = dir + "/" + wfname + "_" + time_now + ".py"
        wfname += "_" + time_now

    wfc = "rocketsled.auto_sleds." + wfname + ".wf_creator"
    kwargs['wf_creator'] = wfc
    kwargs['dimensions'] = dimensions

    if func is not None:
        if not hasattr(func, '__call__'):
            raise TypeError("func must be a callable function.")
        rawfunc = os.path.abspath(sys.modules.get(func.__module__).__file__)
        if rawfunc.endswith(".pyc"):
            rawfunc = rawfunc[:-3] + func.__name__
        else:
            rawfunc = rawfunc[:-2] + func.__name__

        # See if import will work
        try:
            deserialize(rawfunc)
        except AttributeError:
            warnings.warn("Import attempt failed! File will still be written, "
                          "but if launching does not work, make sure the "
                          "function exists, is named properly, and "
                          "the string argument of the functions location is "
                          "/the/full/path/to/my/module.myfunc")

        funcpath = "rocketsled.auto_sleds." + wfname + ".f"

        # For the workflow name: prevent FireWorks name indexing from causing
        # BSON errors with large dims; also avoids web GUI ugliness with
        # large dims.
        pointstr = " @ ' + str(x)" if len(dimensions) < 5 else "'"

        with open(filename, 'w') as f:
            try:
                f.write("from __future__ import unicode_literals\n")
                f.write('"""\n')
                f.write(
                    "This is an automatically created script from auto_setup.\n"
                    "If you are not comfortable working with FireWorks and "
                    "PyTask, do NOT move this\nfile out this directory or "
                    "rename it if you want to run this workflow!\n\nIf you are"
                    " comfortable working with FireWorks and PyTask, feel f"
                    "ree to edit\nand/or move this file to suit your needs. "
                    "See the OptTask documentation and the\nexamples for more "
                    "information on setting up workflow creators.\n")
                f.write('"""\n')
                f.write("from fireworks import PyTask, Firework, Workflow, "
                        "LaunchPad\n")
                f.write(
                    "from fireworks.core.rocket_launcher import rapidfire\n")
                f.write("from rocketsled.utils import deserialize, "
                        "random_guess\n")
                f.write("from rocketsled import OptTask\n\n\n")
                f.write(
                    "# This is your function, imported to rocketsled to use"
                    " with PyTask.\n")
                f.write("f = deserialize('" + rawfunc + "')\n\n")
                f.write("def wf_creator(x):\n")
                f.write("    spec = {'_x_opt':x}\n")
                f.write("    pt = " + PyTask_as_string(funcpath) + "\n")
                f.write("    ot = " + OptTask_as_string(**kwargs) + "\n")
                f.write(
                    "    fw0 = Firework([pt], spec=spec, name='PyTaskFW')\n")
                f.write("    fw1 = Firework([ot], spec=spec, "
                        "name='RocketsledFW')\n")
                f.write("    wf = Workflow([fw0, fw1], {fw0: [fw1], fw1: []},"
                        " name='" + wfname + pointstr + ")\n")
                f.write("    return wf\n")
                f.write("\n\nif __name__=='__main__': \n\n")
                f.write(
                    "    # Make sure the launchpad below is correct, and "
                    "make changes if necessary if\n    # it does not match "
                    "the OptTask db ^^^:\n")
                if all(s in kwargs for s in ['host', 'port', 'name']):
                    h = kwargs['host']
                    p = kwargs['port']
                    n = kwargs['name']
                    f.write("    lpad = LaunchPad(host='{}', port={}, "
                            "name='{}')\n".format(h, p, n))
                elif 'lpad' in kwargs:
                    if isinstance(kwargs['lpad'], LaunchPad):
                        lpad = kwargs['lpad'].to_dict()
                    else:
                        lpad = kwargs['lpad']
                    f.write("    lpad = LaunchPad.from_dict(" + lpad + ")\n")
                else:
                    f.write("    lpad = LaunchPad.auto_load()\n")
                f.write(
                    "    # lpad.reset(password=None, require_password=False"
                    ")\n")
                f.write("\n    # Define your workflow to start...\n")
                f.write("    wf1 = wf_creator(random_guess(" +
                        str(dimensions) + "))\n\n")
                f.write("    # Add it to the launchpad and launch!\n")
                f.write("    lpad.add_wf(wf1)\n")
                if launch_ready:
                    f.write("    rapidfire(lpad, nlaunches=5, sleep_time=0)")
                else:
                    f.write("    # rapidfire(lpad, nlaunches=5, sleep_time=0)")

            except Exception:
                raise
            print("\nFile successfully created!\nFind your auto sled at "
                  "{}\n".format(filename))

    else:
        raise ValueError("Please specify a callable function or a properly "
                         "formatted string location of the function.")
    return filename
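
A hedged sketch of calling auto_setup with your own objective function; the signature and return value follow the code above, while the toy objective, kwargs, and import path are illustrative assumptions.

from rocketsled import auto_setup  # import path is an assumption

def f(x):
    # Toy objective: x is [int, float, str]; returns a single scalar y.
    return x[0] * x[1]

if __name__ == "__main__":
    script = auto_setup(f, [(1, 100), (9.293, 18.2838), ("red", "blue")],
                        wfname="my_sled", opt_label="opt_default")
    print("Generated sled script:", script)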
Example #4
    def __init__(self, *args, **kwargs):
        super(OptTask, self).__init__(*args, **kwargs)

        # Configuration attrs
        lp = self.get("launchpad", LaunchPad.auto_load())
        if isinstance(lp, LaunchPad):
            lp = lp.to_dict()
        self.lpad = LaunchPad.from_dict(lp)
        self.opt_label = self.get("opt_label", "opt_default")
        self.c = getattr(self.lpad.db, self.opt_label)
        self.config = self.c.find_one({"doctype": "config"})
        if self.config is None:
            raise NotConfiguredError(
                "Please use MissionControl().configure to "
                "configure the optimization database "
                "({} - {}) before running OptTask."
                "".format(self.lpad.db, self.opt_label))
        self.wf_creator = deserialize(self.config["wf_creator"])
        self.x_dims = self.config["dimensions"]
        self._xdim_types = self.config["dim_types"]
        self.is_discrete_all = self.config["is_discrete_all"]
        self.is_discrete_any = self.config["is_discrete_any"]
        self.wf_creator_args = self.config["wf_creator_args"] or []
        self.wf_creator_kwargs = self.config["wf_creator_kwargs"] or {}
        self.predictor = self.config["predictor"]
        self.predictor_args = self.config["predictor_args"] or []
        self.predictor_kwargs = self.config["predictor_kwargs"] or {}
        self.maximize = self.config["maximize"]
        self.n_search_pts = self.config["n_search_pts"]
        self.n_train_pts = self.config["n_train_pts"]
        self.n_bootstraps = self.config["n_bootstraps"]
        self.acq = self.config["acq"]
        self.space_file = self.config["space_file"]
        self.onehot_categorical = self.config["onehot_categorical"]
        self.duplicate_check = self.config["duplicate_check"]
        self.get_z = self.config["get_z"]
        if self.get_z:
            self.get_z = deserialize(self.config["get_z"])
        else:
            self.get_z = lambda *ars, **kws: []
        self.get_z_args = self.config["get_z_args"] or []
        self.get_z_kwargs = self.config["get_z_kwargs"] or {}
        self.z_file = self.config["z_file"]
        self.enforce_sequential = self.config["enforce_sequential"]
        self.tolerances = self.config["tolerances"]
        self.batch_size = self.config["batch_size"]
        self.timeout = self.config["timeout"]

        # Declared attrs
        self.n_objs = None
        self.builtin_predictors = {p.__name__: p for p in BUILTIN_PREDICTORS}
        self._n_cats = 0
        self._encoding_info = []

        # Query formats
        self._completed = {
            "x": {
                "$exists": 1
            },
            "y": {
                "$exists": 1,
                "$ne": "reserved"
            },
            "z": {
                "$exists": 1
            },
        }
        self._manager = {"lock": {"$exists": 1}, "queue": {"$exists": 1}}
Example #5
    def optimize(self, fw_spec, manager_id):
        """
        Run the optimization algorithm.

        Args:
            fw_spec (dict): The firework spec.
            manager_id (ObjectId): The MongoDB object id of the manager
                document.

        Returns:
            x (iterable): The current x guess.
            y (iterable): The current y (objective function) value
            z (iterable): The z vector associated with x
            all_xz_new ([list] or [tuple]): The predicted next best guess(es),
                including their associated z vectors
            n_completed (int): The number of completed guesses/workflows
        """
        x = list(fw_spec["_x"])
        y = fw_spec["_y"]
        if isinstance(y, (list, tuple)):
            if len(y) == 1:
                y = y[0]
            self.n_objs = len(y)
            if self.acq not in ("maximin", None):
                raise ValueError(
                    "{} is not a valid acquisition function for multiobjective "
                    "optimization".format(self.acq))
        else:
            if self.acq == "maximin":
                raise ValueError(
                    "Maximin is not a valid acquisition function for single "
                    "objective optimization.")
            self.n_objs = 1

        # If process A suggests a certain guess and runs it, process B may
        # suggest the same guess while process A is running its new workflow.
        # Therefore, process A must reserve the guess. Line below releases
        # reservation on this document in case of workflow failure or end of
        # workflow.
        self.c.delete_one({"x": x, "y": "reserved"})

        # fetch additional attributes for constructing ML model
        z = self.get_z(x, *self.get_z_args, **self.get_z_kwargs)

        # use all possible training points as default
        n_completed = self.c.count_documents(self._completed)
        if not self.n_train_pts or self.n_train_pts > n_completed:
            self.n_train_pts = n_completed

        # check if optimization should be done, if in batch mode
        batch_mode = self.batch_size != 1
        batch_ready = (n_completed not in (0, 1)
                       and (n_completed + 1) % self.batch_size == 0)

        x = convert_native(x)
        y = convert_native(y)
        z = convert_native(z)

        if batch_mode and not batch_ready:
            # 'None' predictor means this job was not used for
            # an optimization run.
            if self.c.find_one({"x": x}):
                if self.c.find_one({"x": x, "y": "reserved"}):
                    # For reserved guesses: update everything
                    self.c.find_one_and_update(
                        {
                            "x": x,
                            "y": "reserved"
                        },
                        {
                            "$set": {
                                "y": y,
                                "z": z,
                                "z_new": [],
                                "x_new": [],
                                "predictor": None,
                                "index": n_completed + 1,
                            }
                        },
                    )
                else:
                    # For completed guesses (i.e., this workflow
                    # is a forced duplicate), do not update
                    # index, but update everything else
                    self.c.find_one_and_update(
                        {"x": x},
                        {
                            "$set": {
                                "y": y,
                                "z": z,
                                "z_new": [],
                                "x_new": [],
                                "predictor": None,
                            }
                        },
                    )
            else:
                # For new guesses: insert x, y, z, index,
                # predictor, and dummy new guesses
                self.c.insert_one({
                    "x": x,
                    "y": y,
                    "z": z,
                    "x_new": [],
                    "z_new": [],
                    "predictor": None,
                    "index": n_completed + 1,
                })
            self.pop_lock(manager_id)
            raise BatchNotReadyError

        # Mongo aggregation framework may give duplicate documents, so we cannot
        # use $sample to randomize the training points used
        searched_indices = random.sample(range(1, n_completed + 1),
                                         self.n_train_pts)
        searched_docs = self.c.find({"index": {
            "$in": searched_indices
        }},
                                    batch_size=10000)
        reserved_docs = self.c.find({"y": "reserved"}, batch_size=10000)
        reserved = []
        for doc in reserved_docs:
            reserved.append(doc["x"])
        all_y = [None] * n_completed
        all_y.append(y)
        all_x_searched = [None] * n_completed
        all_x_searched.append(x)
        z = list(z)
        all_xz_searched = [None] * n_completed
        all_xz_searched.append(x + z)
        for i, doc in enumerate(searched_docs):
            all_x_searched[i] = doc["x"]
            all_xz_searched[i] = doc["x"] + doc["z"]
            all_y[i] = doc["y"]

        all_x_space = self._discretize_space(self.x_dims)
        all_x_space = list(all_x_space) if self.z_file else all_x_space
        all_x_unsearched = []
        for xi in all_x_space:
            xj = list(xi)
            if xj not in all_x_searched and xj not in reserved:
                all_x_unsearched.append(xj)
                if len(all_x_unsearched) == self.n_search_pts:
                    break

        if self.z_file:
            if path.exists(self.z_file):
                with open(self.z_file, "rb") as f:
                    xz_map = pickle.load(f)
            else:
                xz_map = {
                    tuple(xi): self.get_z(xi, *self.get_z_args,
                                          **self.get_z_kwargs)
                    for xi in all_x_space
                }
                with open(self.z_file, "wb") as f:
                    pickle.dump(xz_map, f)

            all_xz_unsearched = [
                xi + xz_map[tuple(xi)] for xi in all_x_unsearched
            ]
        else:
            all_xz_unsearched = [
                xi + self.get_z(xi, *self.get_z_args, **self.get_z_kwargs)
                for xi in all_x_unsearched
            ]

        # If there are no more unsearched points in the entire space, every
        # point has either been searched (i.e., has x, y, and z) or reserved.
        if len(all_xz_unsearched) < 1:
            if self.is_discrete_all:
                raise ExhaustedSpaceError(
                    "The discrete space has been searched"
                    " exhaustively.")
            else:
                raise TypeError("A comprehensive list of points was exhausted "
                                "but the dimensions are not discrete.")
        z_dims = self._z_dims(all_xz_unsearched, all_xz_searched)
        xz_dims = self.x_dims + z_dims

        # run machine learner on Z or X features
        if self.predictor in self.builtin_predictors:
            model = self.builtin_predictors[self.predictor]
            all_xz_searched = self._encode(all_xz_searched, xz_dims)
            all_xz_unsearched = self._encode(all_xz_unsearched, xz_dims)
            all_xz_new_onehot = []
            for _ in range(self.batch_size):
                xz1h = self._predict(
                    all_xz_searched,
                    all_y,
                    all_xz_unsearched,
                    model(*self.predictor_args, **self.predictor_kwargs),
                    self.maximize,
                    scaling=True,
                )
                ix = all_xz_unsearched.index(xz1h)
                all_xz_unsearched.pop(ix)
                all_xz_new_onehot.append(xz1h)
            all_xz_new = [
                self._decode(xz_onehot, xz_dims)
                for xz_onehot in all_xz_new_onehot
            ]

        elif self.predictor == "random":
            all_xz_new = random.sample(all_xz_unsearched, self.batch_size)

        else:
            # If using a custom predictor, automatically convert
            # categorical info to one-hot encoded ints.
            # Used when a custom predictor cannot natively use
            # categorical info
            if self.onehot_categorical:
                all_xz_searched = self._encode(all_xz_searched, xz_dims)
                all_xz_unsearched = self._encode(all_xz_unsearched, xz_dims)

            try:
                predictor_fun = deserialize(self.predictor)
            except Exception as E:
                raise NameError("The custom predictor {} didnt import "
                                "correctly!\n{}".format(self.predictor, E))

            all_xz_new = predictor_fun(
                all_xz_searched,
                all_y,
                self.x_dims,
                all_xz_unsearched,
                *self.predictor_args,
                **self.predictor_kwargs,
            )
            if self.onehot_categorical:
                all_xz_new = self._decode(all_xz_new, xz_dims)

            if not isinstance(all_xz_new[0], (list, tuple)):
                all_xz_new = [all_xz_new]

        # duplicate checking for custom optimizer functions
        if self.duplicate_check:

            if not self.enforce_sequential:
                raise ValueError(
                    "Duplicate checking cannot work when "
                    "optimizations are not enforced sequentially.")
            if (self.predictor not in self.builtin_predictors
                    and self.predictor != "random"):
                all_x_new = [
                    split_xz(xz_new, self.x_dims, x_only=True)
                    for xz_new in all_xz_new
                ]
                all_x_searched = [
                    split_xz(xz, self.x_dims, x_only=True)
                    for xz in all_xz_searched
                ]
                if self.tolerances:
                    for n, x_new in enumerate(all_x_new):
                        if is_duplicate_by_tolerance(
                                x_new, all_x_searched,
                                tolerances=self.tolerances):
                            all_xz_new[n] = random.choice(all_xz_unsearched)
                else:
                    if self.is_discrete_all:
                        # test only for x, not xz because custom predicted z
                        # may not be accounted for
                        for n, x_new in enumerate(all_x_new):
                            if x_new in all_x_searched or x_new == x:
                                all_xz_new[n] = random.choice(
                                    all_xz_unsearched)
                    else:
                        raise ValueError("Define tolerances parameter to "
                                         "duplicate check floats.")
        return x, y, z, all_xz_new, n_completed
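
The custom-predictor branch above calls predictor_fun(all_xz_searched, all_y, self.x_dims, all_xz_unsearched, *args, **kwargs), so a custom predictor must accept those arguments and return one new point or a list of points. A deliberately trivial, hypothetical example:

import random

def my_predictor(XZ_searched, Y, x_dims, XZ_unsearched, *args, **kwargs):
    # Illustrative only: pick a random unsearched point as the next guess.
    return random.choice(XZ_unsearched)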
Example #6
File: task.py  Project: ardunn/TurboWorks
    def optimize(self, fw_spec, manager_id):
        """
        Run the optimization algorithm.

        Args:
            fw_spec (dict): The firework spec.
            manager_id (ObjectId): The MongoDB object id of the manager
                document.

        Returns:
            x (iterable): The current x guess.
            y (iterable): The current y (objective function) value
            z (iterable): The z vector associated with x
            all_xz_new ([list] or [tuple]): The predicted next best guess(es),
                including their associated z vectors
            n_completed (int): The number of completed guesses/workflows
        """
        x = list(fw_spec['_x'])
        y = fw_spec['_y']
        if isinstance(y, (list, tuple)) and len(y) == 1:
            # Unwrap a singly-nested objective (e.g. [y] or [[y1, y2]]) so
            # len(y) below counts objectives, not the wrapper.
            y = y[0]
        if isinstance(y, (list, tuple)):
            self.n_objs = len(y)
            if self.acq not in ("maximin", None):
                raise ValueError(
                    "{} is not a valid acquisition function for multiobjective "
                    "optimization".format(self.acq))
        else:
            if self.acq == "maximin":
                raise ValueError(
                    "Maximin is not a valid acquisition function for single "
                    "objective optimization.")
            self.n_objs = 1

        # If process A suggests a certain guess and runs it, process B may
        # suggest the same guess while process A is running its new workflow.
        # Therefore, process A must reserve the guess. Line below releases
        # reservation on this document in case of workflow failure or end of
        # workflow.
        self.c.delete_one({'x': x, 'y': 'reserved'})

        # fetch additional attributes for constructing ML model
        z = self.get_z(x, *self.get_z_args, **self.get_z_kwargs)

        # use all possible training points as default
        n_completed = self.c.count_documents(self._completed)
        if not self.n_train_pts or self.n_train_pts > n_completed:
            self.n_train_pts = n_completed

        # check if optimization should be done, if in batch mode
        batch_mode = self.batch_size != 1
        batch_ready = n_completed not in (0, 1) and (
                n_completed + 1) % self.batch_size == 0

        x = convert_native(x)
        y = convert_native(y)
        z = convert_native(z)

        if batch_mode and not batch_ready:
            # 'None' predictor means this job was not used for
            # an optimization run.
            if self.c.find_one({'x': x}):
                if self.c.find_one({'x': x, 'y': 'reserved'}):
                    # For reserved guesses: update everything
                    self.c.find_one_and_update(
                        {'x': x, 'y': 'reserved'},
                        {'$set': {'y': y, 'z': z, 'z_new': [],
                                  'x_new': [],
                                  'predictor': None,
                                  'index': n_completed + 1}
                         })
                else:
                    # For completed guesses (i.e., this workflow
                    # is a forced duplicate), do not update
                    # index, but update everything else
                    self.c.find_one_and_update(
                        {'x': x},
                        {'$set': {'y': y, 'z': z, 'z_new': [], 'x_new': [],
                                  'predictor': None}
                         })
            else:
                # For new guesses: insert x, y, z, index,
                # predictor, and dummy new guesses
                self.c.insert_one({'x': x, 'y': y, 'z': z, 'x_new': [],
                                   'z_new': [], 'predictor': None,
                                   'index': n_completed + 1})
            self.pop_lock(manager_id)
            raise BatchNotReadyError

        # Mongo aggregation framework may give duplicate documents, so we cannot
        # use $sample to randomize the training points used
        searched_indices = random.sample(
            range(1, n_completed + 1), self.n_train_pts)
        searched_docs = self.c.find(
            {'index': {'$in': searched_indices}},
            batch_size=10000)
        reserved_docs = self.c.find({'y': 'reserved'}, batch_size=10000)
        reserved = []
        for doc in reserved_docs:
            reserved.append(doc['x'])
        all_y = [None] * n_completed
        all_y.append(y)
        all_x_searched = [None] * n_completed
        all_x_searched.append(x)
        z = list(z)
        all_xz_searched = [None] * n_completed
        all_xz_searched.append(x + z)
        for i, doc in enumerate(searched_docs):
            all_x_searched[i] = doc['x']
            all_xz_searched[i] = doc['x'] + doc['z']
            all_y[i] = doc['y']

        all_x_space = self._discretize_space(self.x_dims)
        all_x_space = list(all_x_space) if self.z_file else all_x_space
        all_x_unsearched = []
        for xi in all_x_space:
            xj = list(xi)
            if xj not in all_x_searched and xj not in reserved:
                all_x_unsearched.append(xj)
                if len(all_x_unsearched) == self.n_search_pts:
                    break

        if self.z_file:
            if path.exists(self.z_file):
                with open(self.z_file, 'rb') as f:
                    xz_map = pickle.load(f)
            else:
                xz_map = {tuple(xi): self.get_z(xi, *self.get_z_args,
                                                **self.get_z_kwargs)
                          for xi in all_x_space}
                with open(self.z_file, 'wb') as f:
                    pickle.dump(xz_map, f)

            all_xz_unsearched = [xi + xz_map[tuple(xi)] for xi in
                                 all_x_unsearched]
        else:
            all_xz_unsearched = [
                xi + self.get_z(xi, *self.get_z_args, **self.get_z_kwargs) for
                xi in all_x_unsearched]

        # If there are no more unsearched points in the entire space, every
        # point has either been searched (i.e., has x, y, and z) or reserved.
        if len(all_xz_unsearched) < 1:
            if self.is_discrete_all:
                raise ExhaustedSpaceError("The discrete space has been searched"
                                          " exhaustively.")
            else:
                raise TypeError("A comprehensive list of points was exhausted "
                                "but the dimensions are not discrete.")
        z_dims = self._z_dims(all_xz_unsearched, all_xz_searched)
        xz_dims = self.x_dims + z_dims

        # run machine learner on Z or X features
        if self.predictor in self.builtin_predictors:
            model = self.builtin_predictors[self.predictor]
            all_xz_searched = self._encode(all_xz_searched, xz_dims)
            all_xz_unsearched = self._encode(all_xz_unsearched, xz_dims)
            all_xz_new_onehot = []
            for _ in range(self.batch_size):
                xz1h = self._predict(all_xz_searched, all_y, all_xz_unsearched,
                                     model(*self.predictor_args,
                                           **self.predictor_kwargs),
                                     self.maximize, scaling=True)
                ix = all_xz_unsearched.index(xz1h)
                all_xz_unsearched.pop(ix)
                all_xz_new_onehot.append(xz1h)
            all_xz_new = [self._decode(xz_onehot, xz_dims) for xz_onehot in
                          all_xz_new_onehot]

        elif self.predictor == 'random':
            all_xz_new = random.sample(all_xz_unsearched, self.batch_size)

        else:
            # If using a custom predictor, automatically convert
            # categorical info to one-hot encoded ints.
            # Used when a custom predictor cannot natively use
            # categorical info
            if self.onehot_categorical:
                all_xz_searched = self._encode(all_xz_searched, xz_dims)
                all_xz_unsearched = self._encode(all_xz_unsearched, xz_dims)

            try:
                predictor_fun = deserialize(self.predictor)
            except Exception as E:
                raise NameError("The custom predictor {} didnt import "
                                "correctly!\n{}".format(self.predictor, E))

            all_xz_new = predictor_fun(all_xz_searched, all_y, self.x_dims,
                                       all_xz_unsearched, *self.predictor_args,
                                       **self.predictor_kwargs)
            if self.onehot_categorical:
                all_xz_new = self._decode(all_xz_new, xz_dims)

            if not isinstance(all_xz_new[0], (list, tuple)):
                all_xz_new = [all_xz_new]

        # duplicate checking for custom optimizer functions
        if self.duplicate_check:

            if not self.enforce_sequential:
                raise ValueError("Duplicate checking cannot work when "
                                 "optimizations are not enforced sequentially.")
            if self.predictor not in self.builtin_predictors and \
                    self.predictor != 'random':
                all_x_new = [split_xz(xz_new, self.x_dims, x_only=True) for
                             xz_new in all_xz_new]
                all_x_searched = [split_xz(xz, self.x_dims, x_only=True) for xz
                                  in all_xz_searched]
                if self.tolerances:
                    for n, x_new in enumerate(all_x_new):
                        if is_duplicate_by_tolerance(x_new, all_x_searched,
                                                     tolerances=self.tolerances):
                            all_xz_new[n] = random.choice(
                                all_xz_unsearched)
                else:
                    if self.is_discrete_all:
                        # test only for x, not xz because custom predicted z
                        # may not be accounted for
                        for n, x_new in enumerate(all_x_new):
                            if x_new in all_x_searched or x_new == x:
                                all_xz_new[n] = random.choice(
                                    all_xz_unsearched)
                    else:
                        raise ValueError("Define tolerances parameter to "
                                         "duplicate check floats.")
        return x, y, z, all_xz_new, n_completed
Example #7
    def test_deserialize(self):
        cwd = os.path.dirname(os.path.realpath(__file__))
        funcstr = cwd + "/deserialize_func.obj_func"
        f = deserialize(funcstr)
        self.assertEqual(f([1, 2, 3]), 6)
        self.assertAlmostEqual(f([1.0, 2.0, 3.0]), 6.0)
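
The assertions above imply that obj_func sums its input vector, so a plausible (hypothetical) deserialize_func.py next to the test would be:

# Hypothetical contents of deserialize_func.py, inferred from the test;
# the real file in the project may differ.
def obj_func(x):
    return sum(x)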
Example #8
    def configure(self, wf_creator, dimensions, **kwargs):
        """
        Set up the optimization config. Required before using OptTask, but only
        needs to be done once. To reconfigure, use MissionControl.reset and then
        use configure again.

        Defaults can be found in defaults.yaml.

        Args:

        wf_creator (function or str): The function object that creates the
            workflow based on a unique vector, x. Alternatively, the full string
            module path to that function, e.g. "mypkg.mymodule.my_wf_creator",
            which must be importable and found in PYTHONPATH.
        dimensions ([tuple]): each 2-tuple in the list defines one dimension in
            the search space in (low, high) format.
            For categorical or discontinuous dimensions, includes all possible
            categories or values as a list of any length or a tuple of length>2.
            Example: dimensions = [(1, 100), (9.293, 18.2838), ("red",
            "blue", "green")].
        **kwargs: Keyword arguments for defining the optimization. A full list
            of possible kwargs is given below:

            Optimization data:
            opt_label (str): The label to use for this collection of
                optimization data.

            Workflow creator function:
            wf_creator_args (list): the positional args to be passed to the
                wf_creator function alongside the new x vector
            wf_creator_kwargs (dict): the kwargs to be passed to the
                wf_creator function alongside the new x vector

            Predictors (optimization):
            predictor (function or str): a function which given a list of
                searched points and unsearched points, returns an optimized
                guess.

                To use a builtin predictor, pass in one of:
                    'GaussianProcessRegressor',
                    'RandomForestRegressor',
                    'ExtraTreesRegressor',
                    'GradientBoostingRegressor',
                    'random' (random guess)
                The default is 'GaussianProcessRegressor'

                To use a custom predictor, pass in the function object.
                Alternatively, the full string module path to that function,
                e.g. "mypkg.mymodule.my_predictor", which must importable and
                found in PYTHONPATH.
                Example builtin predictor: 'GaussianProcessRegressor'
                Example custom predictor: my_predictor
                Example custom predictor 2: 'my_pkg.my_module.my_predictor'
            predictor_args (list): the positional args to be passed to the model
                along with a list of points to be searched. For sklearn-based
                predictors included in OptTask, these positional args are passed
                to the init method of the chosen model. For custom predictors,
                these are passed to the chosen predictor function alongside the
                searched guesses, the output from searched guesses, and an
                unsearched space to be used with optimization.
            predictor_kwargs (dict): the kwargs to be passed to the model.
                Similar to predictor_args.
            n_search_pts (int): The number of points to be searched in the
                search space when choosing the next best point. Choosing more
                points to search may increase the effectiveness of the
                optimization but take longer to evaluate. The default is 1000.
            n_train_pts (int): The number of already explored points to be
                chosen for training. Default is None, meaning all available
                points will be used for training. Reduce the number of points to
                decrease training times.
            n_bootstraps (int): The number of times each optimization should
                sample, train, and predict values when generating uncertainty
                estimates for prediction. At least 10 data points must be
                present for bootstrapping. Not used if: acq not specified,
                custom predictor used, or GaussianProcessRegressor used.
            acq (str): The acquisition function to use. Can be 'ei' for expected
                improvement, 'pi' for probability of improvement, or 'lcb' for
                lower confidence bound, or None for greedy selection. Only works
                with builtin predictors.
            space_file (str): The fully specified path of a pickle file
                containing a list of all possible searchable vectors.
                For example '/Users/myuser/myfolder/myspace.p'. When loaded,
                this space_file should be a list of tuples.
            onehot_categorical (bool): If True, preprocesses categorical data
                (strings) to one-hot encoded binary arrays for use with custom
                predictor functions. Default False.
            duplicate_check (bool): If True, checks that custom optimizers are
                not making duplicate guesses; all built-in optimizers cannot
                duplicate guess. If the custom predictor suggests a duplicate,
                OptTask picks a random guess out of the remaining untried space.
                Default is no duplicate check, and an error is raised if a
                duplicate is suggested.
            tolerances (list): The tolerance of each feature when duplicate
                checking. For categorical features, put 'None'
                Example: Our dimensions are [(1, 100), ['red', 'blue'],
                (2.0, 20.0)]. We want our first parameter to be a duplicate
                only if it is exact, and our third parameter to be a duplicate
                if it is within 1e-6. Then:
                    tolerances=[0, None, 1e-6]
            maximize (bool): If True, maximizes the objective function instead
                of minimizing. Defaults to False, meaning minimize.

            z-vector features:
            get_z (string): the fully-qualified name of a function which, given
                an x vector, returns another vector z which provides extra
                information to the machine learner. The features defined in z
                are not used to run the workflow, but are used for learning. If
                z_features are enabled, ONLY z features will be used for
                learning (x vectors essentially become tags or identifiers
                only).
                Examples:
                    get_z = 'my_pkg.my_module.my_fun'
                    get_z = '/path/to/folder/containing/my_dir/my_module.my_fun'
            get_z_args (list): the positional arguments to be passed to the
                get_z function alongside x
            get_z_kwargs (dict): the kwargs to be passed to the get_z function
                alongside x
            z_file (str): The filename (pickle file) where OptTask should save
                /cache z calculations. Specify this argument if calculating z
                for many (n_search_pts) is not trivial and will cost time in
                computing. With this argument specified, each z will only be
                calculated once. Defaults to None, meaning that all unexplored z
                are re-calculated each iteration.
                Example:
                    z_file = '/path/to/z_guesses.p'

            Parallelism:
            enforce_sequential (bool): WARNING: Experimental feature! If True,
                enforces that RS optimizations are run sequentially (default),
                which prevents duplicate guesses from ever being run. If False,
                allows OptTasks to run optimizations in parallel, which may
                cause duplicate guesses with high parallelism.
            batch_size (int): The number of jobs to submit per batch for a batch
                optimization. For example, batch_size=5 will optimize every 5th
                job, then submit another 5 jobs based on the best 5 predictions
                (recomputing the acquisition function after each prediction).
            timeout (int): The number of seconds to wait before resetting the
                lock on the db.

        Returns:
            None: If you want to run the OptTask workflow, you'll need to pass
            in the launchpad and opt_label arguments in your wf_creator.
        """
        config = get_default_opttask_kwargs()
        config["launchpad"] = self.launchpad.to_db_dict()
        config["opt_label"] = self.opt_label
        for kw in kwargs.keys():
            if kw not in config:
                raise KeyError(
                    "{} is not a valid argument for configure. Choose "
                    "from: {}".format(kw, list(config.keys())))
            elif kw in ["get_z", "predictor"]:
                # Serialize callables; keep string module paths as given.
                if hasattr(kwargs[kw], '__call__'):
                    config[kw] = serialize(kwargs[kw])
                else:
                    config[kw] = kwargs[kw]
            else:
                config[kw] = kwargs[kw]
        if hasattr(wf_creator, '__call__'):
            wf_creator = serialize(wf_creator)
        config["wf_creator"] = wf_creator
        config["dimensions"] = dimensions

        # Determine data types of dimensions
        config["dim_types"] = check_dims(dimensions)
        config["is_discrete_any"] = is_discrete(dimensions, criteria="any")
        config["is_discrete_all"] = is_discrete(dimensions, criteria="all")

        # Ensure importable functions are importable
        try:
            deserialize(wf_creator)
        except ImportError as IE:
            self.logger.warn("wf_creator " + IMPORT_WARNING + "\n" + str(IE))
        try:
            pre = config["predictor"]
            if pre:
                if "." in pre:
                    deserialize(pre)
        except ImportError as IE:
            self.logger.warn("predictor " + IMPORT_WARNING + "\n" + str(IE))
        try:
            getz = config["get_z"]
            if getz:
                if "." in getz:
                    deserialize(getz)
        except ImportError as IE:
            self.logger.warn("get_z " + IMPORT_WARNING + "\n" + str(IE))

        # Ensure acquisition function is valid (for builtin predictors)
        acq_funcs = [None, 'ei', 'pi', 'lcb', 'maximin']
        if config['acq'] not in acq_funcs:
            raise ValueError(
                "Invalid acquisition function. Use 'ei', 'pi', 'lcb', "
                "'maximin' (multiobjective), or None.")
        config["doctype"] = "config"
        self.config = config
        if self.c.find_one({"doctype": "config"}):
            raise ValueError("A config is already present in this Launchpad "
                             "for opt_label=={}. Please use the MissionControl"
                             " reset method to reset the database config."
                             "".format(self.opt_label))
        else:
            self.c.insert_one(self.config)
            self.logger.info("Rocketsled configuration succeeded.")
        self.is_configured = True