def main_loop(alibox, round, strategy):
    """Run one fold of a multi-label active-learning experiment (micro-F1).

    The shared label-ranking ``model`` is fitted once on the initially
    labeled entries and afterwards updated incrementally with each queried
    batch only. After every query the micro-F1 score on the test split is
    recorded in the fold's saver.

    Relies on the module-level names ``X``, ``mult_y``,
    ``mult_y_for_metric``, ``model``, ``get_Xy_in_multilabel``,
    ``QueryMultiLabelAUDI``, ``f1_score`` and ``copy``.

    Parameters
    ----------
    alibox: object
        Toolbox providing ``get_split``, ``get_stateio`` and ``State``.

    round: object
        Fold identifier forwarded to ``alibox.get_split`` and
        ``alibox.get_stateio``.

    strategy: object
        Query strategy exposing ``select(label_ind, unlab_ind)``.

    Returns
    -------
    saver: object
        A deep copy of the saver holding one state per query.
    """
    _, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)

    # init model
    X_tr, y_tr, _ = get_Xy_in_multilabel(
        label_ind, X=X, y=mult_y, unknown_element=0)
    model.fit(X=X_tr, y=y_tr)

    ini_lab_num = len(label_ind)
    # A simple stopping criterion to specify the query budget.
    while len(label_ind) - ini_lab_num <= 120:
        # query and update
        if isinstance(strategy, QueryMultiLabelAUDI):
            # If you are using a label ranking model, pass it to AUDI. It can
            # avoid re-training a label ranking model inside the algorithm
            select_labs = strategy.select(label_ind, unlab_ind, model=model)
        else:
            select_labs = strategy.select(label_ind, unlab_ind)

        # use cost to record the amount of queried instance-label pairs:
        # a 1-element index is charged one query per column of mult_y.
        if len(select_labs[0]) == 1:
            cost = mult_y.shape[1]
        else:
            cost = len(select_labs)
        label_ind.update(select_labs)
        unlab_ind.difference_update(select_labs)

        # train/test: only the newly queried entries are fed to the model,
        # which is updated incrementally instead of being refitted.
        X_tr, y_tr, _ = get_Xy_in_multilabel(
            select_labs, X=X, y=mult_y, unknown_element=0)
        model.fit(X=X_tr, y=y_tr, is_incremental=True)
        _, pred = model.predict(X[test_idx])

        # using sklearn to calc micro-f1; -1 predictions are mapped to 0
        # before scoring against mult_y_for_metric.
        pred[pred == -1] = 0
        perf = f1_score(y_true=mult_y_for_metric[test_idx], y_pred=pred,
                        average='micro')

        # save
        st = alibox.State(select_index=select_labs, performance=perf,
                          cost=cost)
        saver.add_state(st)
        # NOTE(review): the collapsed source does not show whether save()
        # ran per iteration or once after the loop; per-iteration is used
        # here so intermediate results are kept. Confirm.
        saver.save()

    return copy.deepcopy(saver)
def main_loop(alibox, round, strategy):
    """Run one fold of a multi-label active-learning experiment (hamming loss).

    A fresh ``LabelRankingModel`` is created for the fold and refitted from
    scratch on all labeled entries after every query. The hamming loss on
    the test split is recorded in the fold's saver after each query.

    Relies on the module-level names ``X``, ``mult_y``,
    ``LabelRankingModel``, ``get_Xy_in_multilabel`` and ``copy``.

    Parameters
    ----------
    alibox: object
        Toolbox providing ``get_split``, ``get_stateio``, ``State`` and
        ``calc_performance_metric``.

    round: object
        Fold identifier forwarded to ``alibox.get_split`` and
        ``alibox.get_stateio``.

    strategy: object
        Query strategy exposing ``select(label_ind, unlab_ind)``.

    Returns
    -------
    saver: object
        A deep copy of the saver holding one state per query.
    """
    _, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)
    # base model
    model = LabelRankingModel()

    # NOTE(review): this compares the total number of labeled entries with
    # 120, not the number of queried ones; if the initial labeled set is
    # already larger, no query is made. Confirm this is intended.
    while len(label_ind) <= 120:
        # query and update
        select_labs = strategy.select(label_ind, unlab_ind)

        # use cost to record the amount of queried instance-label pairs:
        # a 1-element index is charged one query per column of mult_y.
        if len(select_labs[0]) == 1:
            cost = mult_y.shape[1]
        else:
            cost = len(select_labs)
        label_ind.update(select_labs)
        unlab_ind.difference_update(select_labs)

        # train/test: refit on everything labeled so far
        X_tr, y_tr, _ = get_Xy_in_multilabel(label_ind, X=X, y=mult_y)
        model.fit(X=X_tr, y=y_tr)
        _, pred = model.predict(X[test_idx])
        perf = alibox.calc_performance_metric(
            y_true=mult_y[test_idx], y_pred=pred,
            performance_metric='hamming_loss')

        # save
        st = alibox.State(select_index=select_labs, performance=perf,
                          cost=cost)
        saver.add_state(st)

    return copy.deepcopy(saver)
def main_loop(alibox, strategy, round):
    """Run one fold of a cost-aware multi-label active-learning experiment.

    Queries are made until the module-level ``stopping_criterion`` reports
    a stop. After each query the model is refitted on all labeled entries
    and the hamming loss on the test split is recorded in the fold's saver.

    Relies on the module-level names ``X``, ``y``, ``model``, ``cost``,
    ``budget``, ``stopping_criterion`` and ``get_Xy_in_multilabel``.

    Parameters
    ----------
    alibox: object
        Toolbox providing ``get_split``, ``get_stateio``, ``State`` and
        ``calc_performance_metric``.

    strategy: object
        Query strategy exposing
        ``select(label_ind, unlab_ind, cost=..., budget=...)``.

    round: object
        Fold identifier forwarded to ``alibox.get_split`` and
        ``alibox.get_stateio``.

    Returns
    -------
    saver: object
        The fold's saver (not a copy) holding one state per query.
    """
    # Get the data split of one fold experiment
    _, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)

    while not stopping_criterion.is_stop():
        # Select a subset of Uind according to the query strategy
        select_ind = strategy.select(label_ind, unlab_ind,
                                     cost=cost, budget=budget)
        # Disabled in the original:
        # select_ind = hierarchical_multilabel_mark(select_ind, label_ind, label_tree, y)
        label_ind.update(select_ind)
        unlab_ind.difference_update(select_ind)

        # Update model and calc performance according to the model you are using
        X_tr, y_tr, _ = get_Xy_in_multilabel(label_ind, X=X, y=y)
        model.fit(X_tr, y_tr)
        pred = model.predict(X[test_idx, :])
        # Predictions equal to 0 are replaced by 1 before scoring.
        pred[pred == 0] = 1
        performance = alibox.calc_performance_metric(
            y_true=y[test_idx], y_pred=pred,
            performance_metric='hamming_loss')

        # Record this iteration in the saver; every state is charged the
        # full budget.
        # NOTE(review): `.index` assumes select() returns an index
        # collection with an `index` attribute; for a plain list this would
        # store the bound method `list.index`. Confirm.
        st = alibox.State(select_index=select_ind.index,
                          performance=performance, cost=budget)
        saver.add_state(st)

        # Passing the current progress to stopping criterion object
        stopping_criterion.update_information(saver)

    # Reset the progress in stopping criterion object
    stopping_criterion.reset()
    return saver
query_y[select_ins, select_y2] = -1 elif y1 >= y2: query_y[select_ins, select_y1] = 1 query_y[select_ins, select_y2] = 0.5 else: query_y[select_ins, select_y1] = 0.5 query_y[select_ins, select_y2] = 1 # record results label_ind.update([(select_ins, select_y1), (select_ins, select_y2)]) unlab_ind.difference_update([(select_ins, select_y1), (select_ins, select_y2)]) if iter % 5 == 0: # train/test X_tr, y_tr, _ = get_Xy_in_multilabel(label_ind, X=X, y=query_y) model.fit(X=X_tr, y=y_tr) pres, pred = model.predict(X[test_idx]) perf = alibox.calc_performance_metric( y_true=mult_y[test_idx], y_pred=pred, performance_metric='hamming_loss') # save st = alibox.State(select_index=[(select_ins, select_y1), (select_ins, select_y2)], performance=perf) saver.add_state(st) AURO_results.append(copy.copy(saver))
def select(self, label_index, unlabel_index, epsilon=0.5, **kwargs):
    """Select one instance-label pair from the unlabeled set.

    The instance is chosen first, by comparing the number of labels the
    ranking model predicts as relevant with the average number of relevant
    labels of the labeled instances. The label is then the still-unlabeled
    one whose predicted value is closest to that of the dummy label.

    Parameters
    ----------
    label_index: {list, np.ndarray, MultiLabelIndexCollection}
        The indexes of labeled samples. It should be a 1d array of indexes
        (column major, start from 0), a MultiLabelIndexCollection, or a
        list of 2-element tuples in which the 1st element is the index of
        the instance and the 2nd element is the index of the label.

    unlabel_index: {list, np.ndarray, MultiLabelIndexCollection}
        The indexes of unlabeled samples, in the same format as
        `label_index`.

    epsilon: float, optional (default=0.5)
        Lower bound applied to the per-instance count of unlabeled entries,
        to avoid zero-division.

    Returns
    -------
    selected_ins_lab_pair: list
        A list with one (instance index, label index) tuple. If
        `unlabel_index` holds at most one entry it is returned unchanged.
    """
    if len(unlabel_index) <= 1:
        return unlabel_index
    unlabel_index = self._check_multi_label_ind(unlabel_index)
    label_index = self._check_multi_label_ind(label_index)

    # select instance by LCI
    # W marks the unlabeled entries with 1 in a matrix shaped like self.y.
    W = unlabel_index.get_matrix_mask(
        mat_shape=self.y.shape, fill_value=1, sparse=False)
    unlab_data, _, data_ind = get_Xy_in_multilabel(
        index=unlabel_index, X=self.X, y=self.y)
    lab_data, lab_lab, _ = get_Xy_in_multilabel(
        index=label_index, X=self.X, y=self.y)
    self._lr_model.fit(lab_data, lab_lab)
    pres, labels = self._lr_model.predict(unlab_data)

    # Average number of labels equal to 1 over the instances returned by
    # get_unbroken_instances() (presumably the fully labeled ones).
    avgP = np.mean(
        np.sum(self.y[label_index.get_unbroken_instances(), :] == 1, axis=1))
    unlabeled_per_ins = np.sum(W[data_ind, :] == 1, axis=1)
    insvals = -np.abs(
        (np.sum(labels == 1, axis=1) - avgP)
        / np.fmax(unlabeled_per_ins, epsilon))
    # argmin of the negated score picks the largest absolute deviation.
    selected_row = np.argmin(insvals)

    # The last column of pres is the predicted value of the dummy label.
    # Select the label by the distance between each label and the dummy
    # label. Known entries are set to -inf so their distance becomes inf
    # and argmin never picks them. (-np.inf replaces np.NINF, which was
    # removed in NumPy 2.0.)
    known_mask = np.asarray(1 - W[data_ind], dtype=bool)
    pres[:, 0:-1][known_mask] = -np.inf
    dis = np.abs(pres[selected_row, 0:-1] - pres[selected_row, -1])
    selected_lab = np.argmin(dis)

    # Map the row of unlab_data back to the instance index it came from.
    selected_ins = data_ind[selected_row]
    return [(selected_ins, selected_lab)]
alibox = ToolBox(X=X, y=mult_y, query_type='PartLabels') alibox.split_AL(test_ratio=0.2, initial_label_rate=0.05, all_class=False) model = LabelRankingModel() # base model # query type strategy AURO_results = [] for round in range(5): train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round) # Get intermediate results saver for one fold experiment saver = alibox.get_stateio(round) query_y = mult_y.copy() # for labeling `less relevant` AURO_strategy = QueryTypeAURO(X=X, y=mult_y) # init model X_tr, y_tr, _ = get_Xy_in_multilabel(label_ind, X=X, y=mult_y) model.fit(X=X_tr, y=y_tr) for iter in range(100): select_ins, select_y1, select_y2 = AURO_strategy.select(label_ind, unlab_ind, model=model, y_mat=query_y) # relevance y1 = mult_y[select_ins, select_y1] y2 = mult_y[select_ins, select_y2] if y1 < 0 and y2 < 0: query_y[select_ins, select_y1] = -1 query_y[select_ins, select_y2] = -1