Пример #1
0
 def run(self):
     """
     Runs ER on the test database at every threshold in self.thresholds.

     Builds a ForestMatchFunction from the training database/labels, evaluates it once
     on the validation database/labels, then for each threshold clusters the test
     database with weak_connected_components and scores the resulting labeling.

     :return labels_list: List of predicted labelings, one per threshold.
                          labels_list[threshold_index] = dict [identifier, cluster label]
     :return metrics_list: List of Metrics objects.
                           metrics_list[threshold_index] = Metrics
     :return new_metrics_list: List of NewMetrics objects.
                               new_metrics_list[threshold_index] = NewMetrics
     """
     # NOTE: LogisticMatchFunction accepts the same constructor arguments and can be
     # swapped in here. The trailing 0.5 is presumably the initial decision threshold
     # (it is overridden for every iteration below) -- TODO confirm.
     weak_match_function = ForestMatchFunction(self._database_train, self._labels_train,
                                               self._train_pair_seed, 0.5)

     # NOTE(review): the message says "test database", but the evaluation below runs on
     # the validation database. The string is left unchanged in case logs are parsed.
     print('Testing pairwise match function on test database')
     # The returned ROC object is not used by this method, so it is not bound.
     weak_match_function.test(self._database_validation, self._labels_validation, self._validation_seed)

     # These depend only on the test data, not on the threshold, so build them once.
     class_balance_test = count_pairwise_class_balance(self._labels_test)
     blocks = BlockingScheme(self._database_test, single_block=True)

     labels_list = []
     metrics_list = []
     new_metrics_list = []
     for threshold in self.thresholds:
         # Single-argument print(...) emits the same text under Python 2 and Python 3.
         print('Running entity resolution at threshold = {}'.format(threshold))
         weak_match_function.set_decision_threshold(threshold)
         labels_pred = weak_connected_components(self._database_test, weak_match_function, blocks)
         labels_list.append(labels_pred)
         metrics_list.append(Metrics(self._labels_test, labels_pred))
         new_metrics_list.append(NewMetrics(self._database_test, labels_pred, weak_match_function,
                                            class_balance_test))
     return labels_list, metrics_list, new_metrics_list