Example No. 1
    def main_loop(alibox, strategy, round):
        # Get the data split of one fold experiment
        train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)

        # Get intermediate results saver for one fold experiment
        saver = alibox.get_stateio(round)

        # QueryMeta needs the model outputs of the first five query rounds,
        # so warm-start each fold with five random selections
        temp_rand = QueryRandom(X, y)
        model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
        for i in range(5):
            rand_select_ind = temp_rand.select(label_ind, unlab_ind)
            label_ind.update(rand_select_ind)
            unlab_ind.difference_update(rand_select_ind)
            model.fit(X=X[label_ind.index, :], y=y[label_ind.index])

        # Evaluate the warm-started model to set the initial point
        pred = model.predict(X[test_idx, :])
        accuracy = sum(pred == y[test_idx]) / len(test_idx)
        saver.set_initial_point(accuracy)

        while not stopping_criterion.is_stop():
            # Select a subset of unlab_ind according to the query strategy;
            # the current model is passed so the strategy can score candidates
            select_ind = strategy.select(label_ind, unlab_ind, model=model, batch_size=1)
            label_ind.update(select_ind)
            unlab_ind.difference_update(select_ind)

            # Update model and calc performance according to the model you are using
            model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
            pred = model.predict(X[test_idx, :])
            accuracy = alibox.calc_performance_metric(y_true=y[test_idx],
                                                      y_pred=pred,
                                                      performance_metric='accuracy_score')

            # Save intermediate results to file
            st = alibox.State(select_index=select_ind, performance=accuracy)
            saver.add_state(st)

            # Pass the current progress to the stopping criterion object
            stopping_criterion.update_information(saver)
        # Reset the progress in the stopping criterion object
        stopping_criterion.reset()
        return saver
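
A minimal driver for main_loop above, assuming ALiPy's standard ToolBox API (ToolBox, split_AL, get_default_model, get_stopping_criterion and get_query_strategy are regular ALiPy entry points); the digits dataset and uncertainty strategy are only illustrative, and main_loop's own dependencies (QueryRandom, QueryMeta) must be importable:

    from sklearn.datasets import load_digits
    from alipy import ToolBox

    X, y = load_digits(return_X_y=True)
    alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')
    alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=5)

    model = alibox.get_default_model()
    stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 50)

    results = []
    for round in range(5):
        # Uncertainty sampling is a stand-in; any ALiPy strategy works here
        strategy = alibox.get_query_strategy(strategy_name='QueryInstanceUncertainty')
        results.append(main_loop(alibox, strategy, round))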
Example No. 2
    for round in range(5):
        # Get the data split of one fold experiment
        train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
        # Get intermediate results saver for one fold experiment
        saver = alibox.get_stateio(round)
        # Calculate the initial performance point
        model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
        pred = model.predict(X[test_idx, :])
        accuracy = sum(pred == y[test_idx]) / len(test_idx)
        saver.set_initial_point(accuracy)

        while not stopping_criterion.is_stop():
            # Select a subset of unlab_ind with the random query strategy
            select_ind = random.select(unlab_ind)
            label_ind.update(select_ind)
            unlab_ind.difference_update(select_ind)

            # Update model and calc performance according to the model you are using
            model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
            pred = model.predict(X[test_idx, :])
            accuracy = alibox.calc_performance_metric(
                y_true=y[test_idx],
                y_pred=pred,
                performance_metric='accuracy_score')

            # Save intermediate results to file
            st = alibox.State(select_index=select_ind, performance=accuracy)
            saver.add_state(st)
            saver.save()

            # Pass the current progress to the stopping criterion object;
            # without this the while loop above never terminates
            stopping_criterion.update_information(saver)
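
Once each fold's saver has been collected (e.g. into a list named results, as in the driver sketch after Example No. 1), ALiPy's experiment analyser can aggregate and plot the learning curves; a short sketch using the standard get_experiment_analyser / add_method / plot_learning_curves calls:

    # Aggregate the per-fold StateIO savers and plot mean and std-area curves
    analyser = alibox.get_experiment_analyser(x_axis='num_of_queries')
    analyser.add_method(method_name='random', method_results=results)
    print(analyser)
    analyser.plot_learning_curves(title='Learning curves', std_area=True)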
Example No. 3
    def select(self,
               label_index,
               unlabel_index,
               model=None,
               xb_way='uncertainty'):
        """Select indexes from the unlabel_index for querying.

        Parameters
        ----------
        label_index: {list, np.ndarray, IndexCollection}
            The indexes of labeled samples.

        unlabel_index: {list, np.ndarray, IndexCollection}
            The indexes of unlabeled samples.

        model: object, optional (default=None)
            Current classification model, should have the 'predict_proba' method for probabilistic output.
            If not provided, LogisticRegression with default parameters implemented by sklearn will be used.

        xb_way: str, optional (default='uncertainty')
            The strategy used to select the reference instance x^;
            one of 'uncertainty' or 'random'.

        Returns
        -------
        select_ind: int
            The selected index.

        label_ind: IndexCollection
            A copy of the updated labeled index collection.

        unlabel_ind: IndexCollection
            A copy of the updated unlabeled index collection.
        """
        if model is None:
            model = LogisticRegression()
        if not self.flag:
            self.get_5_rounds(label_index, unlabel_index, model)

        label_ind = copy.deepcopy(self.label_inds_5[4])
        unlabel_ind = copy.deepcopy(self.unlabel_inds_5[4])

        # Select x^ (the reference instance) that will be paired with every
        # candidate to form the combined [x*, x^] meta-features
        if xb_way == 'uncertainty':
            un = QueryInstanceUncertainty(self.X, self.y)
            selectedind = un.select(label_ind, unlabel_ind, model)
        elif xb_way == 'random':
            rand = QueryRandom(self.X, self.y)
            selectedind = rand.select(label_ind, unlabel_ind)[0]
        else:
            raise ValueError(
                "xb_way must be one of ['uncertainty', 'random']")

        # Compute the meta-features Z for every unlabeled candidate
        metadata = self.cal_mate_data_Z(self.label_inds_5, self.unlabel_inds_5,
                                        self.modelOutput_5, model)

        # Locate x^ among the unlabeled candidates and pair its meta-feature
        # vector with every candidate's meta-features
        metadata_unind = np.where(unlabel_ind == selectedind)[0][0]
        cd_second = metadata[metadata_unind]
        num_unlabeled = len(metadata)
        cd_second = np.tile(cd_second, [num_unlabeled, 1])
        combination_data = np.c_[metadata, cd_second]

        # Score every [candidate, x^] pair with the meta-classifier and pick
        # the candidate with the highest positive-class probability
        predict_proba = self.cb_classifier.predict_proba(combination_data)
        select = np.argmax(predict_proba[:, 1])

        select_ind = unlabel_ind[select]
        label_ind.update(select_ind)
        unlabel_ind.difference_update(select_ind)
        # Refit on the updated labeled set so the stored model output
        # reflects this round's query
        model.fit(X=self.X[label_ind.index, :], y=self.y[label_ind.index])

        # Slide the five-round history window: drop the oldest round
        del self.label_inds_5[0]
        del self.unlabel_inds_5[0]
        del self.modelOutput_5[0]

        self.label_inds_5.append(label_ind)
        self.unlabel_inds_5.append(unlabel_ind)
        if hasattr(model, 'predict_proba'):
            # Map the positive-class probability from [0, 1] to [-1, 1]
            output = (model.predict_proba(self.X)[:, 1] - 0.5) * 2
        else:
            output = model.predict(self.X)
        self.modelOutput_5.append(output)

        return select_ind, copy.deepcopy(self.label_inds_5[4]), copy.deepcopy(
            self.unlabel_inds_5[4])
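
A sketch of how this select method might be called from an experiment loop; the QueryMeta class name (taken from the comment in Example No. 1) and its constructor signature are assumptions:

    # Hypothetical usage: one query step with the meta-strategy
    meta_query = QueryMeta(X, y)  # assumed constructor
    select_ind, label_ind, unlab_ind = meta_query.select(
        label_ind, unlab_ind, model=model, xb_way='uncertainty')
    # The returned collections already contain the newly queried index
    model.fit(X=X[label_ind.index, :], y=y[label_ind.index])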
Example No. 4
    # Generate the first five rounds of data (label_index, unlabel_index, model_output)
    label_index_round = []
    unlabel_index_round = []
    model_output_round = []

    for round in range(splitcount):
        label_inds_5 = []
        unlabel_inds_5 = []
        model_output_5 = []

        train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
        temp_rand = QueryRandom(X, y)
        model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
        for i in range(5):
            rand_select_ind = temp_rand.select(label_ind, unlab_ind) 
            label_ind.update(rand_select_ind)
            unlab_ind.difference_update(rand_select_ind)
            label_inds_5.append(copy.deepcopy(label_ind))
            unlabel_inds_5.append(copy.deepcopy(unlab_ind))
            model.fit(X=X[label_ind.index, :], y=y[label_ind.index])  
            if hasattr(model, 'predict_proba'):
                # Map the positive-class probability from [0, 1] to [-1, 1]
                output = (model.predict_proba(X)[:, 1] - 0.5) * 2
            else:
                output = model.predict(X)
            model_output_5.append(output)
        
        label_index_round.append(label_inds_5)
        unlabel_index_round.append(unlabel_inds_5)
        model_output_round.append(model_output_5)
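
The cached collections can then seed a fold without re-running the five random queries; a small sketch (the fold index is illustrative):

    import copy

    round = 0  # any fold index < splitcount
    label_ind = copy.deepcopy(label_index_round[round][4])
    unlab_ind = copy.deepcopy(unlabel_index_round[round][4])
    model.fit(X=X[label_ind.index, :], y=y[label_ind.index])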
Example No. 5
    def get_5_rounds(self,
                     label_ind,
                     unlabel_ind,
                     Model,
                     querystrategy='random'):
        """
        label_ind: {list, np.ndarray, IndexCollection}
            The indexes of labeled samples.

        unlabel_ind: {list, np.ndarray, IndexCollection}
            The indexes of unlabeled samples.

        model: object, optional (default=None)
            Current classification model, should have the 'predict_proba' method for probabilistic output.
            If not provided, LogisticRegression with default parameters implemented by sklearn will be used.
        
        querystategy: str, default='uncertainty'
            In the first five rounds of active learning,choose to select the query strategy.
            Currently only supported uncertainty and random
        """
        assert (isinstance(label_ind, IndexCollection))
        assert (isinstance(unlabel_ind, IndexCollection))
        label_index = copy.deepcopy(label_ind)
        unlabel_index = copy.deepcopy(unlabel_ind)
        model = copy.deepcopy(Model)

        if querystrategy == 'uncertainty':
            un = QueryInstanceUncertainty(self.X, self.y)
            for _ in range(5):
                select_ind = un.select(label_index, unlabel_index, model=model)
                label_index.update(select_ind)
                unlabel_index.difference_update(select_ind)
                self.label_inds_5.append(copy.deepcopy(label_index))
                self.unlabel_inds_5.append(copy.deepcopy(unlabel_index))
                model.fit(X=self.X[label_index.index, :],
                          y=self.y[label_index.index])
                if hasattr(model, 'predict_proba'):
                    # Map the positive-class probability to [-1, 1],
                    # matching the 'random' branch below
                    output = (model.predict_proba(self.X)[:, 1] - 0.5) * 2
                else:
                    output = model.predict(self.X)
                self.modelOutput_5.append(output)

        elif querystrategy == 'random':
            random = QueryRandom(self.X, self.y)
            for _ in range(5):
                select_ind = random.select(label_index, unlabel_index)
                label_index.update(select_ind)
                unlabel_index.difference_update(select_ind)
                self.label_inds_5.append(copy.deepcopy(label_index))
                self.unlabel_inds_5.append(copy.deepcopy(unlabel_index))
                model.fit(X=self.X[label_index.index, :],
                          y=self.y[label_index.index])

                if hasattr(model, 'predict_proba'):
                    # Map the positive-class probability to [-1, 1]
                    output = (model.predict_proba(self.X)[:, 1] - 0.5) * 2
                else:
                    output = model.predict(self.X)

                self.modelOutput_5.append(output)

        elif querystrategy is None:
            # No warm start: fill the caches with zero placeholders
            for _ in range(5):
                num_label = len(label_index.index)
                num_unlabel = len(unlabel_index.index)
                n_samples = np.shape(self.X)[0]
                self.label_inds_5.append(np.zeros(num_label))
                self.unlabel_inds_5.append(np.zeros(num_unlabel))
                self.modelOutput_5.append(np.zeros(n_samples))
        self.flag = True
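
All five examples lean on ALiPy's IndexCollection container; a minimal sketch of its update / difference_update semantics (the concrete index values are illustrative):

    from alipy.index import IndexCollection

    label_ind = IndexCollection([0, 3, 7])
    unlab_ind = IndexCollection([1, 2, 4, 5, 6])

    select_ind = 4
    label_ind.update(select_ind)             # add the queried index
    unlab_ind.difference_update(select_ind)  # remove it from the pool

    print(label_ind.index)  # e.g. [0, 3, 7, 4]
    print(len(unlab_ind))   # 4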