Example #1
def generate_boolean_data_with_complete_test_sets(a, d, n, g, tg, noise,
        random_seed=1, n_learning_sets=1, funcs_pickle_path=None):
    """Generate a synthetic MTL problem of learning Boolean functions according
    to the given parameters. In addition, create test sets that cover the
    complete attribute space (2**a distinct examples).
    Log the report about the generated MTL problem, which includes:
    - the Boolean function of each group,
    - the % of True values in y for each task,
    - the average % of True values in y (across all tasks).
    
    Parameters
    ----------
    a : int
        Number of attributes/variables of the generated Boolean functions.
    d : int
        The expected number of attributes/variables in a disjunct.
    n : int
        The number of examples to generate for each task.
    g : int
        The number of task groups to generate. Each task group shares the
        same Boolean functions.
    tg : int
        The number of tasks (with their corresponding data) to generate for
        each task group.
    noise : float
        The proportion of examples of each task that have their class values
        determined randomly.
    random_seed : int (optional)
        The random seed with which to initialize a private Random object.
    n_learning_sets : int (optional)
        The number of different learning sets to create for each task.
    funcs_pickle_path : str (optional)
        Path at which to pickle the list of generated Boolean functions.
    
    Returns
    -------
    tasks : list
        If n_learning_sets == 1, a list of Bunch objects corresponding to
        Boolean function learning tasks.
        Otherwise, a list of lists of Bunch objects, where each inner list
        holds the different learning sets for one task.
    tasks_complete_test_sets : list
        A list of (X, y) tuples corresponding to complete testing sets for each
        task.
    
    """
    tasks, funcs, attr = _generate_boolean_data(a, d, n, g, tg, noise,
                            random_seed, n_learning_sets=n_learning_sets)
    if funcs_pickle_path:
        pickle_obj(funcs, funcs_pickle_path)
    
    tasks_complete_test_sets = []
    # generate a complete testing set for each Boolean function
    n_funcs = len(funcs)
    print ("Generating the complete test sets for {} Boolean functions".
           format(n_funcs))
    for i, func in enumerate(funcs):
        complete_test_set = _generate_complete_test_set(attr, func)
        # duplicate the generated complete testing set for each task from the
        # current task group 
        for _ in range(tg):
            tasks_complete_test_sets.append(complete_test_set)
        update_progress(1.* (i + 1) / n_funcs)
    print
    
    _report_about_generated_boolean_mtl_problem(funcs, tasks)
    return tasks, tasks_complete_test_sets
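
The docstring above states that each complete test set covers the whole attribute space, i.e. 2**a distinct examples. Below is a minimal standalone sketch of how such a set could be enumerated; it only illustrates the idea, since the library's own _generate_complete_test_set, its attr representation, and its Boolean-function objects are not reproduced here, and the bool_func callable is a made-up stand-in.

from itertools import product

def complete_test_set_sketch(a, bool_func):
    """Illustrative only: enumerate all 2**a attribute assignments and label
    them with a hypothetical Boolean function `bool_func`."""
    X = [list(bits) for bits in product([False, True], repeat=a)]
    y = [bool_func(x) for x in X]
    return X, y

# toy 3-variable function: x0 and (x1 or not x2)
X, y = complete_test_set_sketch(3, lambda x: x[0] and (x[1] or not x[2]))
print(len(X))  # 8 == 2**3 distinct examples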
Example #2
 def __call__(self, tasks, base_learner):
     """Run the merging algorithm for the given tasks. Perform the
     intelligent merging of tasks' data according to the ERM learning method.
     After the merging is complete, build a model for each remaining (merged)
     task and assign this model to each original task of this (merged) task.
     Return a dictionary of data structures computed within this call to ERM.
     It has the following keys:
         task_models -- dictionary mapping from each original task id to its
             model
         dend_info -- list of tuples (one for each merged task) as returned
             by the convert_merg_history_to_scipy_linkage function
     
     Arguments:
     tasks -- dictionary mapping from tasks' ids to their Task objects
     base_learner -- scikit-learn estimator
     
     """
     self._base_learner = base_learner
     # create an ordered dictionary of MergedTask objects from the given
     # dictionary of tasks
     self._tasks = OrderedDict()
     for _, task in sorted(tasks.iteritems()):
         merg_task = MergedTask(task)
         self._tasks[merg_task.id] = merg_task
     # populate the dictionary of task pairs that are candidates for merging
     C = dict()
     pairs = list(combinations(self._tasks, 2))
     n_pairs = len(pairs)
     msg = "Computing candidate pairs for merging ({} pairs)".format(
         n_pairs)
     logger.debug(msg)
     print msg
     for i, (tid_i, tid_j) in enumerate(pairs):
         if self._prefilter(tid_i, tid_j):
             avg_pred_errs, p_values_ij = \
                 self._estimate_errors_significances(tid_i, tid_j)
             er_ij = error_reduction(avg_pred_errs["data1"]["data1"],
                                     avg_pred_errs["data2"]["data2"],
                                     avg_pred_errs["dataM"]["dataM"],
                                     self._tasks[tid_i].get_data_size(),
                                     self._tasks[tid_j].get_data_size())
             min_ij = min(avg_pred_errs["data1"]["dataM"],
                          avg_pred_errs["data2"]["dataM"])
             if er_ij >= 0 and avg_pred_errs["dataM"]["dataM"] <= min_ij:
                 cp = CandidatePair(tid_i, tid_j, p_values_ij)
                 C[cp.key] = cp
         update_progress(1. * (i + 1) / n_pairs)
     print
      # iteratively merge the most similar pair of tasks while candidate
      # pairs remain
     n_cand = len(C)
     msg = "Processing {} candidate pairs for merging".format(n_cand)
     logger.debug(msg)
     print msg
     while len(C) > 0:
         # find the task pair with the minimal maximal p-value
         maxes = [(cp_key, cp.get_max_p_value())
                  for cp_key, cp in C.iteritems()]
         (min_tid_i, min_tid_j), _ = min(maxes, key=lambda x: x[1])
         # merge the pair of tasks and update self._tasks
         task_M = MergedTask(self._tasks[min_tid_i], self._tasks[min_tid_j])
         tid_M = task_M.id
         del self._tasks[min_tid_i]
         del self._tasks[min_tid_j]
         self._tasks[tid_M] = task_M
         # remove task pairs that don't exist anymore from C
         for (tid_i, tid_j) in C.keys():
             if ((tid_i == min_tid_i) or (tid_i == min_tid_j)
                     or (tid_j == min_tid_i) or (tid_j == min_tid_j)):
                 del C[(tid_i, tid_j)]
         # find new task pairs that are candidates for merging
         for tid_i in self._tasks:
             if tid_i != tid_M and self._prefilter(tid_i, tid_M):
                 avg_pred_errs, p_values_iM = \
                     self._estimate_errors_significances(tid_i, tid_M)
                 er_iM = error_reduction(avg_pred_errs["data1"]["data1"],
                                         avg_pred_errs["data2"]["data2"],
                                         avg_pred_errs["dataM"]["dataM"],
                                         self._tasks[tid_i].get_data_size(),
                                         self._tasks[tid_M].get_data_size())
                 min_iM = min(avg_pred_errs["data1"]["dataM"],
                              avg_pred_errs["data2"]["dataM"])
                 if er_iM >= 0 and avg_pred_errs["dataM"]["dataM"] <= min_iM:
                     cp = CandidatePair(tid_i, tid_M, p_values_iM)
                     C[cp.key] = cp
         update_progress(1. * len(C) / n_cand, invert=True)
     print
     # build a model for each remaining (merged) task and store the info
     # for drawing a dendrogram showing the merging history
     task_models = dict()
     dend_info = []
     for merg_task in self._tasks.itervalues():
         # NOTE: When the number of unique class values is less than 2, we
         # cannot fit an ordinary model (e.g. logistic regression). Instead,
         # we have to use a dummy classifier which is subsequently augmented
         # to handle all the other class values.
         # NOTE: The scikit-learn estimator must be cloned so that each
         # (merged) task gets its own classifier
         X, y = merg_task.get_learn_data()
         if len(np.unique(y)) < 2:
             logger.info("Learning data for merged task {} has less than 2 "
                         "class values. Using DummyClassifier.".\
                         format(merg_task))
             model = DummyClassifier()
             model.fit(X, y)
             change_dummy_classes(model, np.array([0, 1]))
         else:
             model = clone(self._base_learner)
             model.fit(X, y)
         # assign this model to each original task of this (merged) task
         original_ids = merg_task.get_original_ids()
         for tid in original_ids:
             task_models[tid] = model
         # store the dendrogram info (if the task is truly a merged task)
         if len(original_ids) > 1:
             dend_info.append(
                 convert_merg_history_to_scipy_linkage(
                     merg_task.merg_history))
     # create and fill the return dictionary
     R = dict()
     R["task_models"] = task_models
     R["dend_info"] = dend_info
     return R
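
Inside the while loop above, the pair to merge next is the candidate with the minimal maximal p-value. The following standalone sketch shows just that selection step on a plain dict of invented p-values, standing in for the CandidatePair objects and get_max_p_value calls used by the real code.

# Sketch of the "minimal maximal p-value" selection; keys and p-values are
# invented for illustration and replace the CandidatePair machinery above.
candidates = {
    ("t1", "t2"): [0.01, 0.04],
    ("t1", "t3"): [0.20, 0.02],
    ("t2", "t3"): [0.03, 0.05],
}
maxes = [(pair, max(p_vals)) for pair, p_vals in candidates.items()]
best_pair, best_max_p = min(maxes, key=lambda x: x[1])
print(best_pair, best_max_p)  # ('t1', 't2') 0.04 -> merged next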
Example #3
 def __call__(self, tasks, base_learner):
     """Run the merging algorithm for the given tasks. Perform the
     intelligent merging of tasks' data according to the ERM learning method.
     After the merging is complete, build a model for each remaining (merged)
     task and assign this model to each original task of this (merged) task.
     Return a dictionary of data structures computed within this call to ERM.
     It has the following keys:
         task_models -- dictionary mapping from each original task id to its
             model
         dend_info -- list of tuples (one for each merged task) as returned
             by the convert_merg_history_to_scipy_linkage function
     
     Arguments:
     tasks -- dictionary mapping from tasks' ids to their Task objects
     base_learner -- scikit-learn estimator
     
     """
     self._base_learner = base_learner
     # create an ordered dictionary of MergedTask objects from the given
     # dictionary of tasks
     self._tasks = OrderedDict()
     for _, task in sorted(tasks.iteritems()):
         merg_task = MergedTask(task)
         self._tasks[merg_task.id] = merg_task
     # populate the dictionary of task pairs that are candidates for merging
     C = dict()
     pairs = list(combinations(self._tasks, 2))
     n_pairs = len(pairs)
     msg = "Computing candidate pairs for merging ({} pairs)".format(n_pairs)
     logger.debug(msg)
     print msg
     for i, (tid_i, tid_j) in enumerate(pairs):
         if self._prefilter(tid_i, tid_j):
             avg_pred_errs, p_values_ij = \
                 self._estimate_errors_significances(tid_i, tid_j)
             er_ij = error_reduction(avg_pred_errs["data1"]["data1"],
                                     avg_pred_errs["data2"]["data2"],
                                     avg_pred_errs["dataM"]["dataM"],
                                     self._tasks[tid_i].get_data_size(),
                                     self._tasks[tid_j].get_data_size())
             min_ij = min(avg_pred_errs["data1"]["dataM"],
                          avg_pred_errs["data2"]["dataM"])
              if er_ij >= 0 and avg_pred_errs["dataM"]["dataM"] <= min_ij:
                 cp = CandidatePair(tid_i, tid_j, p_values_ij)
                 C[cp.key] = cp
         update_progress(1.* (i + 1) / n_pairs)
     print
      # iteratively merge the most similar pair of tasks while candidate
      # pairs remain
     n_cand = len(C)
     msg = "Processing {} candidate pairs for merging".format(n_cand)
     logger.debug(msg)
     print msg
     while len(C) > 0:
         # find the task pair with the minimal maximal p-value
         maxes = [(cp_key, cp.get_max_p_value()) for cp_key, cp in
                  C.iteritems()]
         (min_tid_i, min_tid_j), _ = min(maxes, key=lambda x: x[1])
         # merge the pair of tasks and update self._tasks
         task_M = MergedTask(self._tasks[min_tid_i], self._tasks[min_tid_j])
         tid_M = task_M.id
         del self._tasks[min_tid_i]
         del self._tasks[min_tid_j]
         self._tasks[tid_M] = task_M
         # remove task pairs that don't exist anymore from C
         for (tid_i, tid_j) in C.keys():
             if ((tid_i == min_tid_i) or (tid_i == min_tid_j) or
                 (tid_j == min_tid_i) or (tid_j == min_tid_j)):
                 del C[(tid_i, tid_j)]
         # find new task pairs that are candidates for merging
         for tid_i in self._tasks:
             if tid_i != tid_M and self._prefilter(tid_i, tid_M):
                 avg_pred_errs, p_values_iM = \
                     self._estimate_errors_significances(tid_i, tid_M)
                 er_iM = error_reduction(avg_pred_errs["data1"]["data1"],
                                         avg_pred_errs["data2"]["data2"],
                                         avg_pred_errs["dataM"]["dataM"],
                                         self._tasks[tid_i].get_data_size(),
                                         self._tasks[tid_M].get_data_size())
                 min_iM = min(avg_pred_errs["data1"]["dataM"],
                              avg_pred_errs["data2"]["dataM"])
                 if er_iM >= 0 and avg_pred_errs["dataM"]["dataM"] <= min_iM:
                     cp = CandidatePair(tid_i, tid_M, p_values_iM)
                     C[cp.key] = cp
         update_progress(1.* len(C) / n_cand, invert=True)
     print
     # build a model for each remaining (merged) task and store the info
     # for drawing a dendrogram showing the merging history
     task_models = dict()
     dend_info = []
     for merg_task in self._tasks.itervalues():
         # NOTE: When the number of unique class values is less than 2, we
         # cannot fit an ordinary model (e.g. logistic regression). Instead,
         # we have to use a dummy classifier which is subsequently augmented
         # to handle all the other class values.
         # NOTE: The scikit-learn estimator must be cloned so that each
         # (merged) task gets its own classifier
         X, y = merg_task.get_learn_data()
         if len(np.unique(y)) < 2:
             logger.info("Learning data for merged task {} has less than 2 "
                         "class values. Using DummyClassifier.".\
                         format(merg_task))
             model = DummyClassifier()
             model.fit(X, y)
             change_dummy_classes(model, np.array([0, 1]))
         else:
             model = clone(self._base_learner)
             model.fit(X, y)
         # assign this model to each original task of this (merged) task
         original_ids = merg_task.get_original_ids()
         for tid in original_ids:
             task_models[tid] = model
         # store the dendrogram info (if the task is truly a merged task)
         if len(original_ids) > 1:
             dend_info.append(convert_merg_history_to_scipy_linkage(
                                 merg_task.merg_history))
     # create and fill the return dictionary
     R = dict()
     R["task_models"] = task_models
     R["dend_info"] = dend_info
     return R
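
The NOTE comments in the final loop explain two details: a merged task whose learning data contains fewer than two class values cannot be fitted with an ordinary estimator, so a DummyClassifier is used (and later patched by the library's change_dummy_classes helper), and the base learner must be cloned so each merged task gets its own model. A hedged standalone sketch of that branch using only public scikit-learn API follows; change_dummy_classes is internal to this codebase and deliberately omitted, and fit_task_model is a name invented here.

import numpy as np
from sklearn.base import clone
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

def fit_task_model(X, y, base_learner=None):
    """Illustrative sketch of the model-fitting branch above."""
    if base_learner is None:
        base_learner = LogisticRegression()
    if len(np.unique(y)) < 2:
        # only one class value present: fall back to a constant predictor
        model = DummyClassifier(strategy="most_frequent")
    else:
        # clone so every (merged) task gets an independently fitted estimator
        model = clone(base_learner)
    model.fit(X, y)
    return model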
Example #4
def generate_boolean_data_with_complete_test_sets(a,
                                                  d,
                                                  n,
                                                  g,
                                                  tg,
                                                  noise,
                                                  random_seed=1,
                                                  n_learning_sets=1,
                                                  funcs_pickle_path=None):
    """Generate a synthetic MTL problem of learning Boolean functions according
    to the given parameters. In addition, create test sets that cover the
    complete attribute space (2**a distinct examples).
    Log the report about the generated MTL problem, which includes:
    - the Boolean function of each group,
    - the % of True values in y for each task,
    - the average % of True values in y (across all tasks).
    
    Parameters
    ----------
    a : int
        Number of attributes/variables of the generated Boolean functions.
    d : int
        The expected number of attributes/variables in a disjunct.
    n : int
        The number of examples to generate for each task.
    g : int
        The number of task groups to generate. Each task group shares the
        same Boolean functions.
    tg : int
        The number of tasks (with their corresponding data) to generate for
        each task group.
    noise : float
        The proportion of examples of each task that have their class values
        determined randomly.
    random_seed : int (optional)
        The random seed with which to initialize a private Random object.
    n_learning_sets : int (optional)
        The number of different learning sets to create for each task.
    funcs_pickle_path : str (optional)
        Path at which to pickle the list of generated Boolean functions.
    
    Returns
    -------
    tasks : list
        If n_learning_sets == 1, a list of Bunch objects corresponding to
        Boolean function learning tasks.
        Otherwise, a list of lists of Bunch objects, where each inner list
        holds the different learning sets for one task.
    tasks_complete_test_sets : list
        A list of (X, y) tuples corresponding to complete testing sets for each
        task.
    
    """
    tasks, funcs, attr = _generate_boolean_data(
        a, d, n, g, tg, noise, random_seed, n_learning_sets=n_learning_sets)
    if funcs_pickle_path:
        pickle_obj(funcs, funcs_pickle_path)

    tasks_complete_test_sets = []
    # generate a complete testing set for each Boolean function
    n_funcs = len(funcs)
    print("Generating the complete test sets for {} Boolean functions".format(
        n_funcs))
    for i, func in enumerate(funcs):
        complete_test_set = _generate_complete_test_set(attr, func)
        # duplicate the generated complete testing set for each task from the
        # current task group
        for _ in range(tg):
            tasks_complete_test_sets.append(complete_test_set)
        update_progress(1. * (i + 1) / n_funcs)
    print

    _report_about_generated_boolean_mtl_problem(funcs, tasks)
    return tasks, tasks_complete_test_sets
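
The docstring mentions a logged report with the percentage of True class values for each task and the average across tasks. A small sketch of how such figures could be computed is given below; the real report comes from _report_about_generated_boolean_mtl_problem and works on the library's Bunch objects, so the plain lists of target values used here are only a stand-in.

def true_percentages(task_y_values):
    """Illustrative only: per-task % of True targets and their average."""
    per_task = [100.0 * sum(1 for v in y if v) / len(y) for y in task_y_values]
    return per_task, sum(per_task) / len(per_task)

per_task, avg = true_percentages([[True, False, True, True],
                                  [False, False, True, False]])
print(per_task)  # [75.0, 25.0]
print(avg)       # 50.0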