示例#1
0
 def __call__(self, tasks, base_learner):
     """Fit one shared model on the merged learning data of all tasks.
     The learning data of every task is concatenated along the first axis,
     a clone of the given base learner is fitted on the result and the
     fitted model is assigned to every task.
     Return a dictionary of data structures computed within this learner.
     It has the following keys:
         task_models -- dictionary mapping from tasks' ids to the learned
             models (every task id maps to the same fitted model object)

     Arguments:
     tasks -- dictionary mapping from tasks' ids to their Task objects
     base_learner -- scikit-learn estimator

     """
     # gather the (X, y) pair of every task and split them into two tuples
     features, targets = zip(*[task.get_learn_data()
                               for task in tasks.itervalues()])
     X_merged = np.concatenate(features, axis=0)
     y_merged = np.concatenate(targets, axis=0)
     logger.debug("Merged data has {0[1]} attributes and {0[0]} examples."
                  .format(X_merged.shape))
     # NOTE: A clone is fitted, so the estimator object passed in by the
     # caller is never fitted by this method
     shared_model = clone(base_learner)
     shared_model.fit(X_merged, y_merged)
     # all task ids refer to the very same fitted model
     return {"task_models": dict.fromkeys(tasks, shared_model)}
示例#2
0
 def __call__(self, tasks, base_learner):
     """Fit one shared model on the merged learning data of all tasks.
     The learning data of every task is concatenated along the first axis,
     a clone of the given base learner is fitted on the result and the
     fitted model is assigned to every task.
     Return a dictionary of data structures computed within this learner.
     It has the following keys:
         task_models -- dictionary mapping from tasks' ids to the learned
             models (every task id maps to the same fitted model object)

     Arguments:
     tasks -- dictionary mapping from tasks' ids to their Task objects
     base_learner -- scikit-learn estimator

     """
     # gather the (X, y) pair of every task and split them into two tuples
     features, targets = zip(*[task.get_learn_data()
                               for task in tasks.itervalues()])
     X_merged = np.concatenate(features, axis=0)
     y_merged = np.concatenate(targets, axis=0)
     logger.debug("Merged data has {0[1]} attributes and {0[0]} examples."
                  .format(X_merged.shape))
     # NOTE: A clone is fitted, so the estimator object passed in by the
     # caller is never fitted by this method
     shared_model = clone(base_learner)
     shared_model.fit(X_merged, y_merged)
     # all task ids refer to the very same fitted model
     return {"task_models": dict.fromkeys(tasks, shared_model)}
示例#3
0
 def __call__(self, task_ids, merged_data, base_learner):
     """Check that the given base learner is an Orange TreeLearner and then
     transform it into a ForcedFirstSplitTreeLearner. Use it on the merged
     data to build a common model for all tasks.
     Assign the fitted model to all tasks.
     Return a dictionary of data structures computed within this learner.
     It has the following keys:
         task_models -- dictionary mapping from tasks' ids to the learned
             models (in this case, all tasks' ids will map to the same model)

     Arguments:
     task_ids -- list of tasks' ids
     merged_data -- Orange.data.Table representing the merged learning data
         of all tasks
     base_learner -- Orange.classification.tree.TreeLearner representing the
         base learner to build the models

     Raises:
     ValueError -- if base_learner is not an instance of
         Orange.classification.tree.TreeLearner

     """
     # check that the given base learner is an Orange's TreeLearner
     if not isinstance(base_learner,
                       Orange.classification.tree.TreeLearner):
         raise ValueError("The base_learner should be an Orange "
                          "TreeLearner.")
     # create an instance of the ForcedFirstSplitTreeLearner (constructed
     # with first_split_attr=self.first_split_attr) and copy the instance
     # attributes of the given base_learner onto it
     ffstl = ForcedFirstSplitTreeLearner(
         first_split_attr=self.first_split_attr)
     for k, v in base_learner.__dict__.items():
         try:
             ffstl.__dict__[k] = v
         except Exception:
             # best effort: an attribute that cannot be copied is only
             # reported. Exception (instead of a bare except) lets
             # KeyboardInterrupt and SystemExit propagate.
             logger.debug("Could not set the value of attribute: {}"
                          .format(k))
     # build a model on the merged data
     model = ffstl(merged_data)
     # assign the very same fitted model to all tasks
     task_models = dict.fromkeys(task_ids, model)
     # create and fill the return dictionary
     return {"task_models": task_models}
示例#4
0
文件: bin_exp.py 项目: marinkaz/PyMTL
 def __call__(self, task_ids, merged_data, base_learner):
     """Check that the given base learner is an Orange TreeLearner and then
     transform it into a ForcedFirstSplitTreeLearner. Use it on the merged
     data to build a common model for all tasks.
     Assign the fitted model to all tasks.
     Return a dictionary of data structures computed within this learner.
     It has the following keys:
         task_models -- dictionary mapping from tasks' ids to the learned
             models (in this case, all tasks' ids will map to the same model)

     Arguments:
     task_ids -- list of tasks' ids
     merged_data -- Orange.data.Table representing the merged learning data
         of all tasks
     base_learner -- Orange.classification.tree.TreeLearner representing the
         base learner to build the models

     Raises:
     ValueError -- if base_learner is not an instance of
         Orange.classification.tree.TreeLearner

     """
     # check that the given base learner is an Orange's TreeLearner
     if not isinstance(base_learner, Orange.classification.tree.TreeLearner):
         raise ValueError("The base_learner should be an Orange "
                          "TreeLearner.")
     # create an instance of the ForcedFirstSplitTreeLearner (constructed
     # with first_split_attr=self.first_split_attr) and copy the instance
     # attributes of the given base_learner onto it
     ffstl = ForcedFirstSplitTreeLearner(
         first_split_attr=self.first_split_attr)
     for k, v in base_learner.__dict__.items():
         try:
             ffstl.__dict__[k] = v
         except Exception:
             # best effort: an attribute that cannot be copied is only
             # reported. Exception (instead of a bare except) lets
             # KeyboardInterrupt and SystemExit propagate.
             logger.debug("Could not set the value of attribute: {}"
                          .format(k))
     # build a model on the merged data
     model = ffstl(merged_data)
     # assign the very same fitted model to all tasks
     task_models = dict.fromkeys(task_ids, model)
     # create and fill the return dictionary
     return {"task_models": task_models}
示例#5
0
 def __call__(self, tasks, base_learner):
     """Fit a separate model for every task on that task's own data only
     (no merging of tasks' data takes place).
     Return a dictionary of data structures computed within this learner.
     It has the following keys:
         task_models -- dictionary mapping from tasks' ids to the learned
             models

     Arguments:
     tasks -- dictionary mapping from tasks' ids to their Task objects
     base_learner -- scikit-learn estimator

     """
     task_models = {}
     for tid, task in tasks.iteritems():
         learn_data = task.get_learn_data()
         n_class_values = len(np.unique(learn_data[1]))
         if n_class_values >= 2:
             # NOTE: Every task gets its own clone of the base learner, so
             # no two tasks share a classifier object
             model = clone(base_learner)
             model.fit(*learn_data)
         else:
             # NOTE: An ordinary model (e.g. logistic regression) cannot be
             # fitted when fewer than 2 class values are present. A dummy
             # classifier is fitted instead and then passed to
             # change_dummy_classes() together with the class values 0 and 1.
             logger.debug("Learning data for task {} has less than 2 class "
                          "values. Using DummyClassifier.".format(tid))
             model = DummyClassifier()
             model.fit(*learn_data)
             change_dummy_classes(model, np.array([0, 1]))
         task_models[tid] = model
     # wrap the models in the return dictionary
     return {"task_models": task_models}
示例#6
0
 def __call__(self, tasks, base_learner):
     """Fit a separate model for every task on that task's own data only
     (no merging of tasks' data takes place).
     Return a dictionary of data structures computed within this learner.
     It has the following keys:
         task_models -- dictionary mapping from tasks' ids to the learned
             models

     Arguments:
     tasks -- dictionary mapping from tasks' ids to their Task objects
     base_learner -- scikit-learn estimator

     """
     task_models = {}
     for tid, task in tasks.iteritems():
         learn_data = task.get_learn_data()
         n_class_values = len(np.unique(learn_data[1]))
         if n_class_values >= 2:
             # NOTE: Every task gets its own clone of the base learner, so
             # no two tasks share a classifier object
             model = clone(base_learner)
             model.fit(*learn_data)
         else:
             # NOTE: An ordinary model (e.g. logistic regression) cannot be
             # fitted when fewer than 2 class values are present. A dummy
             # classifier is fitted instead and then passed to
             # change_dummy_classes() together with the class values 0 and 1.
             logger.debug("Learning data for task {} has less than 2 class "
                          "values. Using DummyClassifier.".format(tid))
             model = DummyClassifier()
             model.fit(*learn_data)
             change_dummy_classes(model, np.array([0, 1]))
         task_models[tid] = model
     # wrap the models in the return dictionary
     return {"task_models": task_models}
 def test_tasks(self, learners, base_learners, measures, results_path,
                save_orange_data=False):
     """Repeat the following experiment self._repeats times:
     Prepare tasks' data with the _prepare_tasks_data() function.
     Test the performance of the given learning algorithms with the given
     base learning algorithms and compute the testing results using the
     given scoring measures.
     Process the obtained repetition scores with the
     _process_repetition_scores() function.
     Note: This function only tests some specific combinations of
     base_learners and learners as used by the binarization experiment:
     a bin_exp.TreeMarkedAndMergedLearner is called with the tasks' ids, the
     merged Orange learning data and the unchanged base learner; any other
     learner requires a base learner that is an Orange.core.Learner, which
     is wrapped in an OrangeClassifierWrapper before the learner is called.

     If calling a learner fails, the error is logged, the scores of the
     previous repetitions (if any) are processed and handed to
     pickle_test_results(), and the original exception is re-raised.
     For every TreeMarkedAndMergedLearner, the resulting tree is pickled and
     saved as an SVG image and a TikZ document in results_path.

     Arguments:
     learners -- ordered dictionary with items of the form (name, learner),
         where name is a string representing the learner's name and
         learner is a MTL method (e.g. ERM, NoMerging, ...)
     base_learners -- ordered dictionary with items of the form (name,
         learner), where name is a string representing the base learner's
         name and learner is an Orange learner (see the note above)
     measures -- list of strings representing measure's names (currently,
         only CA and AUC are supported)
     results_path -- string representing the path where to save any extra
         information about the running of this test (pickled results when
         there is an error in calling the learner; pickled trees and their
         visualizations)
     save_orange_data -- boolean indicating whether to save the Orange data
         tables created with the call to self._prepare_tasks_data() function

     Raises:
     ValueError -- if the combination of a learner and a base learner is
         not one of the supported ones (raised after the error handling
         described above)

     """
     rpt_scores = OrderedDict()
     dend_info = {bl: OrderedDict() for bl in base_learners}
     for i in range(self._repeats):
         self._repetition_number = i
         self._prepare_tasks_data(**self._tasks_data_params)
         if save_orange_data:
             self._save_orange_data(i, results_path)
         rpt_scores[i] = {bl: dict() for bl in base_learners}
         for bl in base_learners:
             for l in learners:
                 start = time.clock()
                 try:
                     is_tree_learner = isinstance(
                         learners[l], bin_exp.TreeMarkedAndMergedLearner)
                     if is_tree_learner:
                         R = learners[l](self._tasks.keys(),
                                         self._merged_learn_data_orange,
                                         base_learners[bl])
                     elif isinstance(base_learners[bl], Orange.core.Learner):
                         wrapped_bl = OrangeClassifierWrapper(
                             orange_learner=base_learners[bl])
                         R = learners[l](self._tasks, wrapped_bl)
                     else:
                         # NOTE: raised inside the try block on purpose, so
                         # that it goes through the error handling below
                         raise ValueError(
                             "An unexpected combination of "
                             "base_learner and learner detected: {} and "
                             "{}".format(type(base_learners[bl]),
                                         type(learners[l])))
                 except Exception:
                     logger.exception("There was an error during repetition:"
                                      " {} with base learner: {} and learner: "
                                      "{}.".format(i, bl, l))
                     if i > 0:
                         logger.info("Saving the results of previous "
                                     "repetitions.")
                         # remove the scores of the last repetition
                         del rpt_scores[i]
                         # process the remaining repetition scores
                         self._process_repetition_scores(rpt_scores,
                                                         dend_info)
                         # pickle them to a file
                         pickle_path_fmt = os.path.join(results_path,
                                                        "bl-{}.pkl")
                         self.pickle_test_results(pickle_path_fmt)
                     # re-raise the original exception together with its
                     # original traceback
                     raise
                 rpt_scores[i][bl][l] = self._test_tasks(R["task_models"],
                                                         measures)
                 end = time.clock()
                 logger.debug("Finished repetition: {}, base learner: {}, "
                              "learner: {} in {:.2f}s".format(i, bl, l,
                                                              end - start))
                 # store dendrogram info if the results contain it
                 if "dend_info" in R:
                     dend_info[bl][i] = R["dend_info"]
                 # pickle and visualize the decision tree if the learner is a
                 # (sub)class of TreeMarkedAndMergedLearner
                 if is_tree_learner:
                     # all tasks share one model, so any value will do
                     tree = R["task_models"].values()[0]
                     pickle_path = os.path.join(
                         results_path,
                         "{}-{}-repeat{}.pkl".format(bl, l, i))
                     svg_path = os.path.join(
                         results_path,
                         "{}-{}-repeat{}.svg".format(bl, l, i))
                     tikz_path = os.path.join(
                         results_path,
                         "{}-{}-repeat{}-tikz.tex".format(bl, l, i))
                     pickle_obj(tree, pickle_path)
                     save_treegraph_image(tree, svg_path)
                     draw_and_save_tikz_tree_document(tree, tikz_path)
     self._process_repetition_scores(rpt_scores, dend_info)
示例#8
0
def _true_fraction(n_true, n_total):
    """Return n_true / n_total as a float (NaN when n_total is zero)."""
    if n_total == 0:
        return float("nan")
    # force float division; plain "/" on integers truncates in Python 2
    return float(n_true) / n_total


def _report_about_generated_boolean_mtl_problem(functions, tasks):
    """Log a report about the generated synthetic Boolean MTL problem
    represented by the given functions and tasks.
    Note: The logger object must be a valid Logger.

    The report (logged at the DEBUG level) lists the Boolean function of
    each task group, the fraction of True values in the target of each task
    (or of each learning set of each task) and the overall fraction of True
    values across all tasks. A fraction over an empty target is reported as
    nan.

    Parameters
    ----------
    functions : list
        A list of Boolean functions comprised of Boolean operators from
        sympy.logic, one function for each task group.
    tasks : list
        Either a list of Bunch objects corresponding to Boolean function
        learning tasks,
        or a list of lists of Bunch objects, where each list corresponds
        to a set of different learning sets for each task.
        The ID of each task must contain its group name in the form
        "Group <number>".

    Raises
    ------
    ValueError
        If the number of distinct group names found in the tasks' ids
        differs from the number of given functions.

    """
    # extract group names from tasks' ids (in order of first appearance)
    group_names = []
    for tl in tasks:
        # for a list of learning sets, the first one represents the task
        t = tl[0] if isinstance(tl, list) else tl
        match = re.search(r"(Group \d+)", t.ID)
        group_name = match.group(1)
        if group_name not in group_names:
            group_names.append(group_name)
    if len(group_names) != len(functions):
        raise ValueError("The number of task groups doesn't correspond to the "
                         "number of Boolean functions.")

    logger.debug("Report about the generated synthetic Boolean MTL problem:")
    logger.debug("  Boolean function of each group:")
    for group_name, func in zip(group_names, functions):
        # NOTE: sympy's pretty() function returns a unicode string, so the
        # string literal must also be a unicode string
        logger.debug(u"   - {}: {}".format(group_name,
                                           pretty(func, wrap_line=False)))
    logger.debug("  % of True values in y for each task:")
    sum_true = 0
    sum_total = 0
    for tl in tasks:
        if isinstance(tl, list):
            for i, t in enumerate(tl):
                # NOTE(review): "== True" is presumably an element-wise
                # comparison on an array-like target -- confirm
                cur_true = sum(t.target == True)
                cur_len = len(t.target)
                sum_true += cur_true
                sum_total += cur_len
                logger.debug("   - {} (learning set #{}): {}".format(
                    t.ID, i, _true_fraction(cur_true, cur_len)))
        else:
            t = tl
            cur_true = sum(t.target == True)
            cur_len = len(t.target)
            sum_true += cur_true
            sum_total += cur_len
            logger.debug("   - {}: {}".format(
                t.ID, _true_fraction(cur_true, cur_len)))
    logger.debug("  Average % of True values in y (across all tasks): {}"
                 .format(_true_fraction(sum_true, sum_total)))
 def test_tasks(self,
                learners,
                base_learners,
                measures,
                results_path,
                save_orange_data=False):
     """Repeat the following experiment self._repeats times:
     Prepare tasks' data with the _prepare_tasks_data() function.
     Test the performance of the given learning algorithms with the given
     base learning algorithms and compute the testing results using the
     given scoring measures.
     Process the obtained repetition scores with the
     _process_repetition_scores() function.
     Note: This function only tests some specific combinations of
     base_learners and learners as used by the binarization experiment:
     a bin_exp.TreeMarkedAndMergedLearner is called with the tasks' ids, the
     merged Orange learning data and the unchanged base learner; any other
     learner requires a base learner that is an Orange.core.Learner, which
     is wrapped in an OrangeClassifierWrapper before the learner is called.

     If calling a learner fails, the error is logged, the scores of the
     previous repetitions (if any) are processed and handed to
     pickle_test_results(), and the original exception is re-raised.
     For every TreeMarkedAndMergedLearner, the resulting tree is pickled and
     saved as an SVG image and a TikZ document in results_path.

     Arguments:
     learners -- ordered dictionary with items of the form (name, learner),
         where name is a string representing the learner's name and
         learner is a MTL method (e.g. ERM, NoMerging, ...)
     base_learners -- ordered dictionary with items of the form (name,
         learner), where name is a string representing the base learner's
         name and learner is an Orange learner (see the note above)
     measures -- list of strings representing measure's names (currently,
         only CA and AUC are supported)
     results_path -- string representing the path where to save any extra
         information about the running of this test (pickled results when
         there is an error in calling the learner; pickled trees and their
         visualizations)
     save_orange_data -- boolean indicating whether to save the Orange data
         tables created with the call to self._prepare_tasks_data() function

     Raises:
     ValueError -- if the combination of a learner and a base learner is
         not one of the supported ones (raised after the error handling
         described above)

     """
     rpt_scores = OrderedDict()
     dend_info = {bl: OrderedDict() for bl in base_learners}
     for i in range(self._repeats):
         self._repetition_number = i
         self._prepare_tasks_data(**self._tasks_data_params)
         if save_orange_data:
             self._save_orange_data(i, results_path)
         rpt_scores[i] = {bl: dict() for bl in base_learners}
         for bl in base_learners:
             for l in learners:
                 start = time.clock()
                 try:
                     is_tree_learner = isinstance(
                         learners[l], bin_exp.TreeMarkedAndMergedLearner)
                     if is_tree_learner:
                         R = learners[l](self._tasks.keys(),
                                         self._merged_learn_data_orange,
                                         base_learners[bl])
                     elif isinstance(base_learners[bl],
                                     Orange.core.Learner):
                         wrapped_bl = OrangeClassifierWrapper(
                             orange_learner=base_learners[bl])
                         R = learners[l](self._tasks, wrapped_bl)
                     else:
                         # NOTE: raised inside the try block on purpose, so
                         # that it goes through the error handling below
                         raise ValueError(
                             "An unexpected combination of "
                             "base_learner and learner detected: {} and "
                             "{}".format(type(base_learners[bl]),
                                         type(learners[l])))
                 except Exception:
                     logger.exception("There was an error during repetition:"
                                      " {} with base learner: {} and learner: "
                                      "{}.".format(i, bl, l))
                     if i > 0:
                         logger.info("Saving the results of previous "
                                     "repetitions.")
                         # remove the scores of the last repetition
                         del rpt_scores[i]
                         # process the remaining repetition scores
                         self._process_repetition_scores(
                             rpt_scores, dend_info)
                         # pickle them to a file
                         pickle_path_fmt = os.path.join(
                             results_path, "bl-{}.pkl")
                         self.pickle_test_results(pickle_path_fmt)
                     # re-raise the original exception together with its
                     # original traceback
                     raise
                 rpt_scores[i][bl][l] = self._test_tasks(
                     R["task_models"], measures)
                 end = time.clock()
                 logger.debug("Finished repetition: {}, base learner: {}, "
                              "learner: {} in {:.2f}s".format(
                                  i, bl, l, end - start))
                 # store dendrogram info if the results contain it
                 if "dend_info" in R:
                     dend_info[bl][i] = R["dend_info"]
                 # pickle and visualize the decision tree if the learner is a
                 # (sub)class of TreeMarkedAndMergedLearner
                 if is_tree_learner:
                     # all tasks share one model, so any value will do
                     tree = R["task_models"].values()[0]
                     pickle_path = os.path.join(
                         results_path,
                         "{}-{}-repeat{}.pkl".format(bl, l, i))
                     svg_path = os.path.join(
                         results_path,
                         "{}-{}-repeat{}.svg".format(bl, l, i))
                     tikz_path = os.path.join(
                         results_path,
                         "{}-{}-repeat{}-tikz.tex".format(bl, l, i))
                     pickle_obj(tree, pickle_path)
                     save_treegraph_image(tree, svg_path)
                     draw_and_save_tikz_tree_document(tree, tikz_path)
     self._process_repetition_scores(rpt_scores, dend_info)
示例#10
0
 def __call__(self, tasks, base_learner):
     """Run the merging algorithm for the given tasks. Perform the
     intelligent merging of tasks' data according to the ERM learning method.
     After the merging is complete, build a model for each remaining (merged)
     task and assign this model to each original task of this (merged) task.
     Return a dictionary of data structures computed within this call to ERM.
     It has the following keys:
         task_models -- dictionary mapping from each original task id to its
             model
         dend_info -- list of tuples (one for each merged task) as returned
             by the convert_merg_history_to_scipy_linkage function
     
     Arguments:
     tasks -- dictionary mapping from tasks' ids to their Task objects
     base_learner -- scikit-learn estimator
     
     """
     self._base_learner = base_learner
     # create an ordered dictionary of MergedTask objects from the given
     # dictionary of tasks
     self._tasks = OrderedDict()
     for _, task in sorted(tasks.iteritems()):
         merg_task = MergedTask(task)
         self._tasks[merg_task.id] = merg_task
     # populate the dictionary of task pairs that are candidates for merging
     C = dict()
     pairs = list(combinations(self._tasks, 2))
     n_pairs = len(pairs)
     msg = "Computing candidate pairs for merging ({} pairs)".format(n_pairs)
     logger.debug(msg)
     print msg
     for i, (tid_i, tid_j) in enumerate(pairs):
         if self._prefilter(tid_i, tid_j):
             avg_pred_errs, p_values_ij = \
                 self._estimate_errors_significances(tid_i, tid_j)
             er_ij = error_reduction(avg_pred_errs["data1"]["data1"],
                                     avg_pred_errs["data2"]["data2"],
                                     avg_pred_errs["dataM"]["dataM"],
                                     self._tasks[tid_i].get_data_size(),
                                     self._tasks[tid_j].get_data_size())
             min_ij = min(avg_pred_errs["data1"]["dataM"],
                          avg_pred_errs["data2"]["dataM"])
             if  er_ij >= 0 and avg_pred_errs["dataM"]["dataM"] <= min_ij:
                 cp = CandidatePair(tid_i, tid_j, p_values_ij)
                 C[cp.key] = cp
         update_progress(1.* (i + 1) / n_pairs)
     print
     # iteratively merge the most similar pair of tasks, until such pairs
     # exist
     n_cand = len(C)
     msg = "Processing {} candidate pairs for merging".format(n_cand)
     logger.debug(msg)
     print msg
     while len(C) > 0:
         # find the task pair with the minimal maximal p-value
         maxes = [(cp_key, cp.get_max_p_value()) for cp_key, cp in
                  C.iteritems()]
         (min_tid_i, min_tid_j), _ = min(maxes, key=lambda x: x[1])
         # merge the pair of tasks and update self._tasks
         task_M = MergedTask(self._tasks[min_tid_i], self._tasks[min_tid_j])
         tid_M = task_M.id
         del self._tasks[min_tid_i]
         del self._tasks[min_tid_j]
         self._tasks[tid_M] = task_M
         # remove task pairs that don't exist anymore from C
         for (tid_i, tid_j) in C.keys():
             if ((tid_i == min_tid_i) or (tid_i == min_tid_j) or
                 (tid_j == min_tid_i) or (tid_j == min_tid_j)):
                 del C[(tid_i, tid_j)]
         # find new task pairs that are candidates for merging
         for tid_i in self._tasks:
             if tid_i != tid_M and self._prefilter(tid_i, tid_M):
                 avg_pred_errs, p_values_iM = \
                     self._estimate_errors_significances(tid_i, tid_M)
                 er_iM = error_reduction(avg_pred_errs["data1"]["data1"],
                                         avg_pred_errs["data2"]["data2"],
                                         avg_pred_errs["dataM"]["dataM"],
                                         self._tasks[tid_i].get_data_size(),
                                         self._tasks[tid_M].get_data_size())
                 min_iM = min(avg_pred_errs["data1"]["dataM"],
                              avg_pred_errs["data2"]["dataM"])
                 if er_iM >= 0 and avg_pred_errs["dataM"]["dataM"] <= min_iM:
                     cp = CandidatePair(tid_i, tid_M, p_values_iM)
                     C[cp.key] = cp
         update_progress(1.* len(C) / n_cand, invert=True)
     print
     # build a model for each remaining (merged) task and store the info
     # for drawing a dendrogram showing the merging history
     task_models = dict()
     dend_info = []
     for merg_task in self._tasks.itervalues():
         # NOTE: When the number of unique class values is less than 2, we
         # cannot fit an ordinary model (e.g. logistic regression). Instead,
         # we have to use a dummy classifier which is subsequently augmented
         # to handle all the other class values.
         # NOTE: The scikit-learn estimator must be cloned so that each
         # (merged) task gets its own classifier
         X, y = merg_task.get_learn_data()
         if len(np.unique(y)) < 2:
             logger.info("Learning data for merged task {} has less than 2 "
                         "class values. Using DummyClassifier.".\
                         format(merg_task))
             model = DummyClassifier()
             model.fit(X, y)
             change_dummy_classes(model, np.array([0, 1]))
         else:
             model = clone(self._base_learner)
             model.fit(X, y)
         # assign this model to each original task of this (merged) task
         original_ids = merg_task.get_original_ids()
         for tid in original_ids:
             task_models[tid] = model
         # store the dendrogram info (if the task is truly a merged task)
         if len(original_ids) > 1:
             dend_info.append(convert_merg_history_to_scipy_linkage(
                                 merg_task.merg_history))
     # create and fill the return dictionary
     R = dict()
     R["task_models"] = task_models
     R["dend_info"] = dend_info
     return R
示例#11
0
 def __call__(self, tasks, base_learner):
     """Run the merging algorithm for the given tasks. Perform the
     intelligent merging of tasks' data according to the ERM learning method.
     After the merging is complete, build a model for each remaining (merged)
     task and assign this model to each original task of this (merged) task.
     Return a dictionary of data structures computed within this call to ERM.
     It has the following keys:
         task_models -- dictionary mapping from each original task id to its
             model
         dend_info -- list of tuples (one for each merged task) as returned
             by the convert_merg_history_to_scipy_linkage function
     
     Arguments:
     tasks -- dictionary mapping from tasks' ids to their Task objects
     base_learner -- scikit-learn estimator
     
     """
     self._base_learner = base_learner
     # create an ordered dictionary of MergedTask objects from the given
     # dictionary of tasks
     self._tasks = OrderedDict()
     for _, task in sorted(tasks.iteritems()):
         merg_task = MergedTask(task)
         self._tasks[merg_task.id] = merg_task
     # populate the dictionary of task pairs that are candidates for merging
     C = dict()
     pairs = list(combinations(self._tasks, 2))
     n_pairs = len(pairs)
     msg = "Computing candidate pairs for merging ({} pairs)".format(
         n_pairs)
     logger.debug(msg)
     print msg
     for i, (tid_i, tid_j) in enumerate(pairs):
         if self._prefilter(tid_i, tid_j):
             avg_pred_errs, p_values_ij = \
                 self._estimate_errors_significances(tid_i, tid_j)
             er_ij = error_reduction(avg_pred_errs["data1"]["data1"],
                                     avg_pred_errs["data2"]["data2"],
                                     avg_pred_errs["dataM"]["dataM"],
                                     self._tasks[tid_i].get_data_size(),
                                     self._tasks[tid_j].get_data_size())
             min_ij = min(avg_pred_errs["data1"]["dataM"],
                          avg_pred_errs["data2"]["dataM"])
             if er_ij >= 0 and avg_pred_errs["dataM"]["dataM"] <= min_ij:
                 cp = CandidatePair(tid_i, tid_j, p_values_ij)
                 C[cp.key] = cp
         update_progress(1. * (i + 1) / n_pairs)
     print
     # iteratively merge the most similar pair of tasks, until such pairs
     # exist
     n_cand = len(C)
     msg = "Processing {} candidate pairs for merging".format(n_cand)
     logger.debug(msg)
     print msg
     while len(C) > 0:
         # find the task pair with the minimal maximal p-value
         maxes = [(cp_key, cp.get_max_p_value())
                  for cp_key, cp in C.iteritems()]
         (min_tid_i, min_tid_j), _ = min(maxes, key=lambda x: x[1])
         # merge the pair of tasks and update self._tasks
         task_M = MergedTask(self._tasks[min_tid_i], self._tasks[min_tid_j])
         tid_M = task_M.id
         del self._tasks[min_tid_i]
         del self._tasks[min_tid_j]
         self._tasks[tid_M] = task_M
         # remove task pairs that don't exist anymore from C
         for (tid_i, tid_j) in C.keys():
             if ((tid_i == min_tid_i) or (tid_i == min_tid_j)
                     or (tid_j == min_tid_i) or (tid_j == min_tid_j)):
                 del C[(tid_i, tid_j)]
         # find new task pairs that are candidates for merging
         for tid_i in self._tasks:
             if tid_i != tid_M and self._prefilter(tid_i, tid_M):
                 avg_pred_errs, p_values_iM = \
                     self._estimate_errors_significances(tid_i, tid_M)
                 er_iM = error_reduction(avg_pred_errs["data1"]["data1"],
                                         avg_pred_errs["data2"]["data2"],
                                         avg_pred_errs["dataM"]["dataM"],
                                         self._tasks[tid_i].get_data_size(),
                                         self._tasks[tid_M].get_data_size())
                 min_iM = min(avg_pred_errs["data1"]["dataM"],
                              avg_pred_errs["data2"]["dataM"])
                 if er_iM >= 0 and avg_pred_errs["dataM"]["dataM"] <= min_iM:
                     cp = CandidatePair(tid_i, tid_M, p_values_iM)
                     C[cp.key] = cp
         update_progress(1. * len(C) / n_cand, invert=True)
     print
     # build a model for each remaining (merged) task and store the info
     # for drawing a dendrogram showing the merging history
     task_models = dict()
     dend_info = []
     for merg_task in self._tasks.itervalues():
         # NOTE: When the number of unique class values is less than 2, we
         # cannot fit an ordinary model (e.g. logistic regression). Instead,
         # we have to use a dummy classifier which is subsequently augmented
         # to handle all the other class values.
         # NOTE: The scikit-learn estimator must be cloned so that each
         # (merged) task gets its own classifier
         X, y = merg_task.get_learn_data()
         if len(np.unique(y)) < 2:
             logger.info("Learning data for merged task {} has less than 2 "
                         "class values. Using DummyClassifier.".\
                         format(merg_task))
             model = DummyClassifier()
             model.fit(X, y)
             change_dummy_classes(model, np.array([0, 1]))
         else:
             model = clone(self._base_learner)
             model.fit(X, y)
         # assign this model to each original task of this (merged) task
         original_ids = merg_task.get_original_ids()
         for tid in original_ids:
             task_models[tid] = model
         # store the dendrogram info (if the task is truly a merged task)
         if len(original_ids) > 1:
             dend_info.append(
                 convert_merg_history_to_scipy_linkage(
                     merg_task.merg_history))
     # create and fill the return dictionary
     R = dict()
     R["task_models"] = task_models
     R["dend_info"] = dend_info
     return R
示例#12
0
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or NaN for a zero
    denominator.

    The explicit float conversion forces true division even when the module
    runs under Python 2 without ``from __future__ import division``.

    """
    if denominator == 0:
        return float("nan")
    return float(numerator) / denominator


def _report_about_generated_boolean_mtl_problem(functions, tasks):
    """Log a report about the generated synthetic Boolean MTL problem
    represented by the given functions and tasks.
    Note: The logger object must be a valid Logger.

    The report (logged at DEBUG level) lists the Boolean function of each
    task group, the fraction of True values in each task's target and the
    overall fraction of True values across all tasks. A fraction is reported
    as NaN when there are no target values to compute it from.

    Parameters
    ----------
    functions : list
        A list of Boolean functions comprised of Boolean operators from
        sympy.logic, one function for each task group.
    tasks : list
        Either a list of Bunch objects corresponding to Boolean function
        learning tasks,
        or a list of lists of Bunch objects, where each list corresponds
        to a set of different learning sets for each task.

    Raises
    ------
    ValueError
        If a task's ID does not contain a "Group <number>" part, or if the
        number of distinct task groups differs from the number of Boolean
        functions.

    """
    # extract group names from tasks' ids (kept in order of first appearance
    # so that they line up with the order of the given functions)
    group_names = []
    for tl in tasks:
        # when a task has several learning sets, the first one identifies it
        t = tl[0] if isinstance(tl, list) else tl
        match = re.search(r"(Group \d+)", t.ID)
        if match is None:
            raise ValueError("Cannot extract the group name from task id "
                             "{!r}.".format(t.ID))
        group_name = match.group(1)
        if group_name not in group_names:
            group_names.append(group_name)
    if len(group_names) != len(functions):
        raise ValueError("The number of task groups doesn't correspond to the "
                         "number of Boolean functions.")

    logger.debug("Report about the generated synthetic Boolean MTL problem:")
    logger.debug("  Boolean function of each group:")
    for group_name, func in zip(group_names, functions):
        # NOTE: sympy's pretty() function returns a unicode string, so the
        # string literal must also be a unicode string
        logger.debug(u"   - {}: {}".format(group_name,
                                           pretty(func, wrap_line=False)))
    logger.debug("  % of True values in y for each task:")
    sum_true = 0
    sum_total = 0
    for tl in tasks:
        has_learning_sets = isinstance(tl, list)
        learning_sets = tl if has_learning_sets else [tl]
        for i, t in enumerate(learning_sets):
            # NOTE: "== True" is an intentional element-wise comparison
            # (the target is expected to be array-like)
            cur_true = sum(t.target == True)  # noqa: E712
            cur_len = len(t.target)
            sum_true += cur_true
            sum_total += cur_len
            ratio = _safe_ratio(cur_true, cur_len)
            if has_learning_sets:
                logger.debug("   - {} (learning set #{}): {}".\
                             format(t.ID, i, ratio))
            else:
                logger.debug("   - {}: {}".format(t.ID, ratio))
    logger.debug("  Average % of True values in y (across all tasks): {}".\
                 format(_safe_ratio(sum_true, sum_total)))