def labelInstances(self, select_ind, client: Client = None, verbose=False):
    """
    Ask the oracle to label the selected instances and write the answers
    into ``self._Y``.

    Parameters
    ----------
    :param select_ind: array-like
        Positions of the instances to label. They are used to slice
        ``self._X`` and are forwarded to the oracle as ``indexes``.
    :param client: distributed.Client, optional (default=None)
        When given, ``client.rebalance`` is invoked on the updated label
        array once every label has been written.
    :param verbose: bool, optional (default=False)
        When True, the labels and the cost returned by the oracle are
        printed.
    """
    # For each selected instance retrieve the label from the oracle
    labels, cost = self._oracle.query(
        instances=self._X[select_ind], indexes=select_ind)

    # Rank of the whole answer batch. It does not change while iterating,
    # so it is computed a single time instead of once per label.
    labels_ndim = len(da.shape(np.asarray(labels)))

    for index, label in zip(select_ind, labels):
        if isinstance(label, (list, np.ndarray)):
            label_shape = np.shape(label)
        else:
            label_shape = da.shape(label)

        # A label whose rank differs from the batch rank is wrapped in a
        # list so that it gains the leading axis used for concatenation.
        new_item = label if len(label_shape) == labels_ndim else [label]

        # Rebuild the label array around position ``index``
        if index == 0:  # replace the first item
            pieces = [new_item, self._Y[index + 1:]]
        elif index == len(self._Y) - 1:  # replace the last item
            pieces = [self._Y[:index], new_item]
        else:  # replace any other item
            pieces = [self._Y[:index], new_item, self._Y[index + 1:]]

        # Persist after every replacement so the next iteration slices the
        # already updated array.
        self._Y = da.concatenate(pieces, axis=0).persist()

    # Rebalance once, after all the labels have been written
    if client is not None:
        client.rebalance(self._Y)

    if verbose:
        print("Label: %s, Cost: %s" % (labels, cost))
def __init__(self,
             client: Client,
             X,
             Y,
             ml_technique,
             scenario_type: AbstractScenario,
             performance_metrics: list,
             query_strategy: SingleLabelIndexQuery,
             oracle: Oracle,
             stopping_criteria: AbstractStopCriterion,
             self_partition: bool,
             kfolds: int = 1,
             batch_size=1,
             **kwargs):
    """
    Validate the experiment configuration, store the data as persisted
    Dask arrays, resolve the train/test/label/unlabel partitions and build
    the scenario for the first fold.

    Parameters
    ----------
    :param client: distributed.Client
        May be None; it is only used to rebalance the persisted data.
    :param X: array-like
        Data matrix with [n_samples, n_features]
    :param Y: array-like
        labels of given data [n_samples, n_labels] or [n_samples]
    :param ml_technique
        Forwarded to the scenario; must not be None.
    :param scenario_type: Sub-Type of AbstractScenario
        Type (class, not instance) of Active Learning scenario to use
    :param performance_metrics: array-like of BaseMetrics elements
        Must contain at least one element.
    :param query_strategy: SingleLabelIndexQuery
    :param oracle: Oracle
    :param stopping_criteria: AbstractStopCriterion
    :param self_partition: bool
    :param kfolds: int, optional (default=1)
    :param batch_size: optional (default=1)
        Forwarded to the scenario.

    If self_partition is True the data is randomly split into k sets
    according to the extra parameters
        -> test_ratio: float, optional (default=0.3)
            Ratio of test set
        -> initial_label_rate: float, optional (default=0.05)
            Ratio of initial label set
            e.g. Initial_labelset*(1-test_ratio)*n_samples
        -> all_class: bool, optional (default=True)
            Whether each split will contain at least one instance for each class.
            If False, a totally random split will be performed.

    If self_partition is False the following parameters must be specified,
    each one as a 2-D structure of shape [n_folds, n_instances]
        -> train_idx:
        -> test_idx:
        -> label_idx:
        -> unlabel_idx:

    :param kwargs: optional
        Extra parameters. Besides the ones listed above, ``rebalance``
        (default=False) requests a ``client.rebalance`` of X and Y when a
        client is available.

    Raises
    ------
    ValueError
        If a required parameter is missing or the provided partitions are
        inconsistent with the data or with each other.
    """
    self._client = client

    # Persist the Dask storage structures. The chunk size is clamped to 1
    # because ``len(...) // 50`` evaluates to 0 for inputs with fewer than
    # 50 rows.
    if isinstance(X, da.core.Array):
        self._X = X.persist()
    else:
        self._X = da.from_array(X, chunks=max(1, len(X) // 50)).persist()

    if isinstance(Y, da.core.Array):
        self._Y = Y.persist()
    else:
        self._Y = da.from_array(Y, chunks=max(1, len(Y) // 50)).persist()

    if client is not None and kwargs.pop("rebalance", False):
        client.rebalance(self._X)
        client.rebalance(self._Y)

    check_X_y(self._X, self._Y, accept_sparse='csc', multi_output=True,
              distributed=False)

    self._scenario_type = scenario_type
    if self._scenario_type is None:
        raise ValueError("required param 'scenario_type' can not be empty")
    if not issubclass(self._scenario_type, AbstractScenario):
        raise ValueError(
            "the 'scenario_type' must be a subclass of 'AbstractScenario'")

    if self_partition:
        self._kfolds = kfolds
        self._train_idx, self._test_idx, self._label_idx, self._unlabel_idx = split(
            X=self._X,
            y=self._Y,
            test_ratio=kwargs.pop("test_ratio", 0.3),
            initial_label_rate=kwargs.pop("initial_label_rate", 0.05),
            split_count=self._kfolds,
            all_class=kwargs.pop("all_class", True))
    else:
        train_idx = kwargs.pop("train_idx", None)
        test_idx = kwargs.pop("test_idx", None)
        label_idx = kwargs.pop("label_idx", None)
        unlabel_idx = kwargs.pop("unlabel_idx", None)

        if train_idx is None:
            raise ValueError(
                "required param 'train_idx' can not be empty ")
        if test_idx is None:
            raise ValueError("required param 'test_idx' can not be empty ")
        if label_idx is None:
            raise ValueError(
                "required param 'label_idx' can not be empty ")
        if unlabel_idx is None:
            raise ValueError(
                "required param 'unlabel_idx' can not be empty ")

        # The tuple unpacking below doubles as a shape check: X and every
        # index structure must be 2-D, Y must be 1-D or 2-D.
        num_inst_x, _ = da.shape(self._X)
        if len(da.shape(self._Y)) > 1:
            num_inst_y, _ = da.shape(self._Y)
        else:
            num_inst_y = da.shape(self._Y)[0]

        folds_train, num_inst_train = np.shape(train_idx)
        folds_test, num_inst_test = np.shape(test_idx)
        folds_labeled, num_inst_labeled = np.shape(label_idx)
        folds_unlabeled, num_inst_unlabeled = np.shape(unlabel_idx)

        if num_inst_x != num_inst_y:
            raise ValueError(
                "Different numbers of instances for inputs (x:%s, y:%s)" %
                (num_inst_x, num_inst_y))

        if not (folds_train == folds_test == folds_labeled == folds_unlabeled):
            raise ValueError(
                "Different numbers of folds for inputs (train_idx:%s, test_idx:%s "
                "label_idx:%s, unlabel_idx:%s)" %
                (folds_train, folds_test, folds_labeled, folds_unlabeled))

        if kfolds != folds_test:
            raise ValueError(
                "Number of folds for inputs (train_idx:%s, test_idx:%s "
                "label_idx:%s, unlabel_idx:%s) must be equals to kfolds:%s param"
                % (folds_train, folds_test, folds_labeled, folds_unlabeled,
                   kfolds))

        if num_inst_train + num_inst_test != num_inst_x:
            raise ValueError(
                "The sum of the number of instances for train_idx and test_idx must be equal to the "
                "number of instances for x"
                "(num_inst_x:%s, num_inst_train:%s num_inst_test:%s)" %
                (num_inst_x, num_inst_train, num_inst_test))

        if num_inst_labeled + num_inst_unlabeled != num_inst_train:
            # Report the three quantities that take part in the comparison
            raise ValueError(
                "The sum of the number of instances for label_idx and unlabel_idx must be equal to the "
                "number of instances for train_idx"
                "(num_inst_labeled:%s, num_inst_unlabeled:%s num_inst_train:%s)"
                % (num_inst_labeled, num_inst_unlabeled, num_inst_train))

        self._kfolds = folds_train
        self._train_idx = train_idx
        self._test_idx = test_idx
        self._label_idx = label_idx
        self._unlabel_idx = unlabel_idx

    self._ml_technique = ml_technique
    if self._ml_technique is None:
        raise ValueError("required param 'ml_technique' can not be empty")

    self._performance_metrics = performance_metrics
    if self._performance_metrics is None or len(
            self._performance_metrics) == 0:
        raise ValueError(
            "required param 'performance_metrics' can not be empty")
    for metric in self._performance_metrics:
        if not isinstance(metric, BaseMetrics):
            raise ValueError(
                "the elements in 'performance_metrics' must be of type BaseMetrics"
            )

    self._query_strategy = query_strategy
    if self._query_strategy is None:
        raise ValueError(
            "required param 'query_strategy' can not be empty")

    self._oracle = oracle
    if self._oracle is None:
        raise ValueError("required param 'oracle' can not be empty")

    self._stopping_criteria = stopping_criteria
    if self._stopping_criteria is None:
        raise ValueError(
            "required param 'stopping_criteria' can not be empty")

    # ``scenario_type`` is already the scenario class (checked above with
    # issubclass), so it is instantiated directly; this works regardless of
    # whether the class name is importable from this module. The scenario
    # is built for the first fold, with private copies of the label and
    # unlabel index collections.
    self._scenario = self._scenario_type(
        X=self._X,
        y=self._Y,
        train_idx=self._train_idx[0],
        test_idx=self._test_idx[0],
        label_idx=copy.deepcopy(IndexCollection(self._label_idx[0])),
        unlabel_idx=copy.deepcopy(IndexCollection(self._unlabel_idx[0])),
        ml_technique=self._ml_technique,
        performance_metrics=self._performance_metrics,
        query_strategy=self._query_strategy,
        oracle=self._oracle,
        batch_size=batch_size)