    def __init__(self, a=2.155, alpha=0.94, b=0.789, beta=1.0):

        super(DoubleBalance, self).__init__()
        self.a = a
        self.alpha = alpha
        self.b = b
        self.beta = beta
        self.fallback_model = SimpleBalance()
Example #2
    def __init__(self,
                 a=2.155,
                 alpha=0.94,
                 b=0.789,
                 beta=1.0,
                 random_state=None):
        super(DoubleBalance, self).__init__()
        self.a = a
        self.alpha = alpha
        self.b = b
        self.beta = beta
        self.fallback_model = SimpleBalance()
        self._random_state = get_random_state(random_state)
Example #3
    def __init__(
        self,
        X,
        y=None,
        model=None,
        query_model=None,
        balance_model=None,
        feature_model=None,
        n_papers=None,
        n_instances=DEFAULT_N_INSTANCES,
        n_queries=None,
        prior_included=[],
        prior_excluded=[],
        log_file=None,
        final_labels=None,
        verbose=1,
        data_fp=None,
    ):
        """ Initialize base class for systematic reviews.

        Arguments
        ---------
        X: np.array
            The feature matrix for the current dataset.
        y: np.array
            Labels of each paper, 1 for included, 0 for excluded.
            Can be set to None to indicate that inclusion data is not
            available.
        model: BaseModel
            Initialized model to fit the data during active learning.
            See asreview.models.utils.py for possible models.
        query_model: BaseQueryModel
            Initialized model to query new instances for review, such as random
            sampling or max sampling.
            See asreview.query_strategies.utils.py for query models.
        balance_model: BaseBalanceModel
            Initialized model to redistribute the training data during the
            active learning process. It might either resample or undersample
            specific papers.
        feature_model: BaseFeatureModel
            Feature extraction model that converts texts and keywords to
            feature matrices.
        n_papers: int
            Number of papers to review during the active learning process,
            excluding the number of initial priors. To review all papers, set
            n_papers to None.
        n_instances: int
            Number of papers to query at each step in the active learning
            process.
        n_queries: int
            Number of steps/queries to perform. Set to None for no limit.
        prior_included: list
            List of papers (ids) that are included a priori.
        prior_excluded: list
            List of papers (ids) that are excluded a priori.
        log_file: str
            Path to log file.
        final_labels: np.array
            Final labels if we're using a two-step inclusion process.
            For example, a paper is first screened on the basis of the
            abstract, and a final decision is then made on the basis of
            the full text.
        """
        super(BaseReview, self).__init__()

        self.X = X
        self.y = y
        if y is None:
            self.y = np.full(X.shape[0], NOT_AVAILABLE)
        self.y = np.array(self.y, dtype=int)
        # Default to Naive Bayes model
        if model is None:
            model = NBModel()
        if query_model is None:
            query_model = MaxQuery()
        if balance_model is None:
            balance_model = SimpleBalance()
        if feature_model is None:
            raise ValueError("Supply feature model!")

        self.model = model
        self.balance_model = balance_model
        self.query_model = query_model
        self.feature_model = feature_model

        self.shared = {"query_src": {}, "current_queries": {}}
        self.model.shared = self.shared
        self.query_model.shared = self.shared
        self.balance_model.shared = self.shared

        self.n_papers = n_papers
        self.n_instances = n_instances
        self.n_queries = n_queries
        self.log_file = log_file
        self.verbose = verbose

        self.prior_included = prior_included
        self.prior_excluded = prior_excluded

        self.query_i = 0
        self.train_idx = np.array([], dtype=int)
        self.model_trained = False
        self.data_fp = data_fp

        with open_logger(log_file) as logger:
            if not logger.is_empty():
                y, train_idx, query_src, query_i = logger.review_state()
                if X.shape[0] != len(y):
                    raise ValueError("The log file does not correspond to the "
                                     "given data file, please use another log "
                                     "file or dataset.")
                self.y = y
                self.train_idx = train_idx
                self.shared["query_src"] = query_src
                self.query_i = query_i
            else:
                if final_labels is not None:
                    logger.set_final_labels(final_labels)
                logger.set_labels(self.y)
                logger.add_settings(self.settings)
                self._prior_knowledge(logger)
                self.query_i = 0
Example #4
    def __init__(
        self,
        as_data,
        model=None,
        query_model=None,
        balance_model=None,
        feature_model=None,
        n_papers=None,
        n_instances=DEFAULT_N_INSTANCES,
        n_queries=None,
        start_idx=[],
        state_file=None,
        log_file=None,
    ):
        """Initialize base class for systematic reviews."""
        super(BaseReview, self).__init__()

        # Default to Naive Bayes model
        if model is None:
            model = NBModel()
        if query_model is None:
            query_model = MaxQuery()
        if balance_model is None:
            balance_model = SimpleBalance()
        if feature_model is None:
            feature_model = Tfidf()

        self.as_data = as_data
        self.y = as_data.labels
        if self.y is None:
            self.y = np.full(len(as_data), LABEL_NA)
        self.model = model
        self.balance_model = balance_model
        self.query_model = query_model
        self.feature_model = feature_model

        self.shared = {"query_src": {}, "current_queries": {}}
        self.model.shared = self.shared
        self.query_model.shared = self.shared
        self.balance_model.shared = self.shared

        self.n_papers = n_papers
        self.n_instances = n_instances
        self.n_queries = n_queries
        self.start_idx = start_idx

        if log_file is not None:
            warnings.warn(
                "The log_file argument for BaseReview will be"
                " replaced by state_file.",
                category=FutureWarning)
            self.state_file = log_file
        else:
            self.state_file = state_file

        self.query_i = 0
        self.query_i_classified = 0
        self.train_idx = np.array([], dtype=int)
        self.model_trained = False

        # Restore the state from a file or initialize said file.
        with open_state(self.state_file) as state:
            # From file
            if not state.is_empty():
                startup = state.startup_vals()
                # If there are start indices not in the training add them.
                if not set(startup["train_idx"]) >= set(start_idx):
                    new_idx = list(set(start_idx) - set(startup["train_idx"]))
                    self.classify(new_idx,
                                  self.y[new_idx],
                                  state,
                                  method="initial")
                    startup = state.startup_vals()
                self.train_idx = startup["train_idx"]
                self.y = startup["labels"]
                self.shared["query_src"] = startup["query_src"]
                self.query_i = startup["query_i"]
                self.query_i_classified = startup["query_i_classified"]
            # From scratch
            else:
                state.set_labels(self.y)
                state.settings = self.settings
                self.classify(start_idx,
                              self.y[start_idx],
                              state,
                              method="initial")
                self.query_i_classified = len(start_idx)

            # Try to retrieve feature matrix from the state file.
            try:
                self.X = state.get_feature_matrix(as_data.hash())
            except KeyError:
                self.X = feature_model.fit_transform(as_data.texts,
                                                     as_data.headings,
                                                     as_data.bodies,
                                                     as_data.keywords)
                state._add_as_data(as_data, feature_matrix=self.X)
            if self.X.shape[0] != len(self.y):
                raise ValueError("The state file does not correspond to the "
                                 "given data file, please use another state "
                                 "file or dataset.")
            self.load_current_query(state)
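Example #5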
class DoubleBalance(BaseBalance):
    """Class for the double balance strategy.

    Class to get the two-way rebalancing function and arguments.
    It oversamples the ones, depending on the number of 0's and the total
    number of samples in the training data.

    Arguments
    ---------
    a: float
        Governs the weight of the 1's. Higher values mean linearly more 1's
        in your training sample.
    alpha: float
        Governs the scaling of the weight of the 1's, as a function of the
        ratio of ones to zeros. A positive value means that the lower the
        ratio of zeros to ones, the higher the weight of the ones.
    b: float
        Governs how strongly we want to sample depending on the total
        number of samples. A value of 1 means no dependence on the total
        number of samples, while lower values mean increasingly stronger
        dependence on the number of samples.
    beta: float
        Governs the scaling of the weight of the zeros depending on the
        number of samples. Higher values mean that larger samples penalize
        zeros more strongly.
    """

    name = "double"

    def __init__(self, a=2.155, alpha=0.94, b=0.789, beta=1.0):

        super(DoubleBalance, self).__init__()
        self.a = a
        self.alpha = alpha
        self.b = b
        self.beta = beta
        self.fallback_model = SimpleBalance()

    def sample(self, X, y, train_idx, shared):
        one_idx = train_idx[np.where(y[train_idx] == 1)]
        zero_idx = train_idx[np.where(y[train_idx] == 0)]

        # Fall back to simple sampling if we have only ones or zeros.
        if len(one_idx) == 0 or len(zero_idx) == 0:
            return self.fallback_model.sample(X, y, train_idx, shared)

        n_one = len(one_idx)
        n_zero = len(zero_idx)
        n_train = n_one + n_zero

        # Compute the weights.
        one_weight = _one_weight(n_one, n_zero, self.a, self.alpha)
        zero_weight = _zero_weight(n_one + n_zero, self.b, self.beta)
        tot_zo_weight = one_weight * n_one + zero_weight * n_zero
        n_one_train = random_round(
            one_weight * n_one * n_train / tot_zo_weight)
        n_one_train = max(1, min(n_train - 2, n_one_train))
        n_zero_train = n_train - n_one_train

        # Get random ones and zeros.
        one_train_idx = fill_training(one_idx, n_one_train)
        zero_train_idx = fill_training(zero_idx, n_zero_train)

        all_idx = np.concatenate([one_train_idx, zero_train_idx])
        np.random.shuffle(all_idx)

        return X[all_idx], y[all_idx]

    def full_hyper_space(self):
        from hyperopt import hp
        parameter_space = {
            "bal_a": hp.lognormal("bal_a", 0, 1),
            "bal_alpha": hp.uniform("bal_alpha", 0, 2),
            "bal_b": hp.uniform("bal_b", 0, 1),
            # "bal_beta": hp.uniform("bal_beta", 0, 2),
        }
        return parameter_space, {}
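
The weight helpers _one_weight and _zero_weight are called in sample() above
but never defined in these snippets. Below is a minimal sketch of what they
could look like, reconstructed from the docstring's description of a, alpha,
b, and beta; the exact formulas are an assumption, not taken from the
examples:

from math import log


def _one_weight(n_one, n_zero, a, alpha):
    # Weight of the 1's: scales linearly with a; the exponent -alpha makes
    # the weight grow as the ratio of ones to zeros shrinks.
    return a * (n_one / n_zero)**(-alpha)


def _zero_weight(n_read, b, beta):
    # Weight of the 0's: exactly 1 when b == 1 (no dependence on the total
    # number of samples); for b < 1 the weight depends on the number of
    # labeled records, with beta scaling that dependence.
    return 1 - (1 - b) * (1 + log(n_read))**(-beta)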
Example #6
    def __init__(
        self,
        as_data,
        model=None,
        query_model=None,
        balance_model=None,
        feature_model=None,
        n_papers=None,
        n_instances=DEFAULT_N_INSTANCES,
        n_queries=None,
        start_idx=[],
        state_file=None,
        log_file=None,
        #                  final_labels=None,
        verbose=1,
        data_fp=None,
    ):
        """ Initialize base class for systematic reviews.

        Arguments
        ---------
        as_data: asreview.ASReviewData
            The data object which contains the text, labels, etc.
        model: BaseModel
            Initialized model to fit the data during active learning.
            See asreview.models.utils.py for possible models.
        query_model: BaseQueryModel
            Initialized model to query new instances for review, such as random
            sampling or max sampling.
            See asreview.query_strategies.utils.py for query models.
        balance_model: BaseBalanceModel
            Initialized model to redistribute the training data during the
            active learning process. It might either resample or undersample
            specific papers.
        feature_model: BaseFeatureModel
            Feature extraction model that converts texts and keywords to
            feature matrices.
        n_papers: int
            Number of papers to review during the active learning process,
            excluding the number of initial priors. To review all papers, set
            n_papers to None.
        n_instances: int
            Number of papers to query at each step in the active learning
            process.
        n_queries: int
            Number of steps/queries to perform. Set to None for no limit.
        start_idx: numpy.array
            Start the simulation/review with these indices. They are assumed
            to be already labeled.
        state_file: str
            Path to state file. Replaces log_file argument.
        final_labels: np.array
            Final labels if we're using a two-step inclusion process.
            For example, a paper is first screened on the basis of the
            abstract, and a final decision is then made on the basis of
            the full text.
        """
        super(BaseReview, self).__init__()

        # Default to Naive Bayes model
        if model is None:
            model = NBModel()
        if query_model is None:
            query_model = MaxQuery()
        if balance_model is None:
            balance_model = SimpleBalance()
        if feature_model is None:
            feature_model = Tfidf()

        self.as_data = as_data
        self.y = as_data.labels
        if self.y is None:
            self.y = np.full(len(as_data), LABEL_NA)
        self.model = model
        self.balance_model = balance_model
        self.query_model = query_model
        self.feature_model = feature_model

        self.shared = {"query_src": {}, "current_queries": {}}
        self.model.shared = self.shared
        self.query_model.shared = self.shared
        self.balance_model.shared = self.shared

        self.n_papers = n_papers
        self.n_instances = n_instances
        self.n_queries = n_queries
        self.start_idx = start_idx

        if log_file is not None:
            warnings.warn(
                "The log_file argument for BaseReview will be"
                " replaced by state_file.",
                category=FutureWarning)
            self.state_file = log_file
        else:
            self.state_file = state_file
        self.verbose = verbose

        self.query_i = 0
        self.query_i_classified = 0
        self.train_idx = np.array([], dtype=int)
        self.model_trained = False
        self.data_fp = data_fp

        with open_state(self.state_file) as state:
            if not state.is_empty():
                startup = state.startup_vals()
                if not set(startup["train_idx"]) >= set(start_idx):
                    new_idx = list(set(start_idx) - set(startup["train_idx"]))
                    self.classify(new_idx,
                                  self.y[new_idx],
                                  state,
                                  method="initial")
                    startup = state.startup_vals()
                self.train_idx = startup["train_idx"]
                self.y = startup["labels"]
                self.shared["query_src"] = startup["query_src"]
                self.query_i = startup["query_i"]
                self.query_i_classified = startup["query_i_classified"]
            else:
                state.set_labels(self.y)
                state.settings = self.settings
                self.classify(start_idx,
                              self.y[start_idx],
                              state,
                              method="initial")
                self.query_i_classified = len(start_idx)

            try:
                self.X = state.get_feature_matrix(as_data.hash())
            except KeyError:
                self.X = feature_model.fit_transform(as_data.texts,
                                                     as_data.headings,
                                                     as_data.bodies,
                                                     as_data.keywords)
                state._add_as_data(as_data, feature_matrix=self.X)
            if self.X.shape[0] != len(self.y):
                raise ValueError("The state file does not correspond to the "
                                 "given data file, please use another state "
                                 "file or dataset.")
            self.load_current_query(state)
Example #7
class DoubleBalance(BaseBalance):
    """Dynamic Resampling balance strategy.

    Class to get the two-way rebalancing function and arguments.
    It oversamples the ones, depending on the number of 0's and the total
    number of samples in the training data.

    Arguments
    ---------
    a: float
        Governs the weight of the 1's. Higher values mean linearly more 1's
        in your training sample.
    alpha: float
        Governs the scaling of the weight of the 1's, as a function of the
        ratio of ones to zeros. A positive value means that the lower the
        ratio of zeros to ones, the higher the weight of the ones.
    b: float
        Governs how strongly we want to sample depending on the total
        number of samples. A value of 1 means no dependence on the total
        number of samples, while lower values mean increasingly stronger
        dependence on the number of samples.
    beta: float
        Governs the scaling of the weight of the zeros depending on the
        number of samples. Higher values mean that larger samples penalize
        zeros more strongly.
    """

    name = "double"

    def __init__(self,
                 a=2.155,
                 alpha=0.94,
                 b=0.789,
                 beta=1.0,
                 random_state=None):
        super(DoubleBalance, self).__init__()
        self.a = a
        self.alpha = alpha
        self.b = b
        self.beta = beta
        self.fallback_model = SimpleBalance()
        self._random_state = get_random_state(random_state)

    def sample(self, X, y, train_idx, shared):
        """Resample the training data.

        Arguments
        ---------
        X: np.array
            Complete feature matrix.
        y: np.array
            Labels for all papers.
        train_idx: np.array
            Training indices, that is all papers that have been reviewed.
        shared: dict
            Dictionary to share data between balancing models and other models.

        Returns
        -------
        np.array, np.array:
            X_train, y_train: the resampled matrix, labels.
        """
        # Get inclusions and exclusions
        one_idx = train_idx[np.where(y[train_idx] == 1)]
        zero_idx = train_idx[np.where(y[train_idx] == 0)]

        # Fall back to simple sampling if we have only ones or zeroes.
        if len(one_idx) == 0 or len(zero_idx) == 0:
            return self.fallback_model.sample(X, y, train_idx, shared)

        n_one = len(one_idx)
        n_zero = len(zero_idx)
        n_train = n_one + n_zero

        # Compute sampling weights.
        one_weight = _one_weight(n_one, n_zero, self.a, self.alpha)
        zero_weight = _zero_weight(n_one + n_zero, self.b, self.beta)
        tot_zo_weight = one_weight * n_one + zero_weight * n_zero
        # Number of inclusions to sample.
        n_one_train = random_round(
            one_weight * n_one * n_train / tot_zo_weight, self._random_state)
        # Should be at least 1, and at least two spots should be for exclusions.
        n_one_train = max(1, min(n_train - 2, n_one_train))
        # Number of exclusions to sample
        n_zero_train = n_train - n_one_train

        # Sample records of ones and zeroes
        one_train_idx = fill_training(one_idx, n_one_train, self._random_state)
        zero_train_idx = fill_training(zero_idx, n_zero_train,
                                       self._random_state)
        # Merge and shuffle.
        all_idx = np.concatenate([one_train_idx, zero_train_idx])
        self._random_state.shuffle(all_idx)

        # Return resampled feature matrix and labels.
        return X[all_idx], y[all_idx]

    def full_hyper_space(self):
        from hyperopt import hp
        parameter_space = {
            "bal_a": hp.lognormal("bal_a", 0, 1),
            "bal_alpha": hp.uniform("bal_alpha", 0, 2),
            "bal_b": hp.uniform("bal_b", 0, 1),
            # "bal_beta": hp.uniform("bal_beta", 0, 2),
        }
        return parameter_space, {}
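
The sampling helpers random_round and fill_training are likewise not shown.
Given how they are called above, here is a plausible sketch; the signatures
match the calls in Example #7, but the bodies are assumptions:

import math

import numpy as np


def random_round(value, random_state):
    # Stochastic rounding: 8.3 becomes 9 roughly 30% of the time and 8
    # otherwise, so the expected value equals the input.
    base = int(math.floor(value))
    if random_state.rand() < value - base:
        base += 1
    return base


def fill_training(src_idx, n_train, random_state):
    # Produce exactly n_train indices: tile src_idx as often as it fits,
    # then draw the remainder without replacement.
    n_copy = n_train // len(src_idx)
    n_sample = n_train - n_copy * len(src_idx)
    dest_idx = np.tile(src_idx, n_copy)
    return np.append(dest_idx,
                     random_state.choice(src_idx, n_sample, replace=False))

Example #5 calls both helpers without a random_state; that older version
presumably falls back to the global np.random generator instead.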
Example #8
    def __init__(
        self,
        as_data,
        model=None,
        query_model=None,
        balance_model=None,
        feature_model=None,
        n_papers=None,
        n_instances=DEFAULT_N_INSTANCES,
        n_queries=None,
        start_idx=[],
        state_file=None,
        log_file=None,
    ):
        """ Initialize base class for systematic reviews.

        Arguments
        ---------
        as_data: asreview.ASReviewData
            The data object which contains the text, labels, etc.
        model: BaseModel
            Initialized model to fit the data during active learning.
            See asreview.models.utils.py for possible models.
        query_model: BaseQueryModel
            Initialized model to query new instances for review, such as random
            sampling or max sampling.
            See asreview.query_strategies.utils.py for query models.
        balance_model: BaseBalanceModel
            Initialized model to redistribute the training data during the
            active learning process. It might either resample or undersample
            specific papers.
        feature_model: BaseFeatureModel
            Feature extraction model that converts texts and keywords to
            feature matrices.
        n_papers: int
            Number of papers to review during the active learning process,
            excluding the number of initial priors. To review all papers, set
            n_papers to None.
        n_instances: int
            Number of papers to query at each step in the active learning
            process.
        n_queries: int
            Number of steps/queries to perform. Set to None for no limit.
        start_idx: numpy.array
            Start the simulation/review with these indices. They are assumed
            to be already labeled. Failing to do so might result in bad
            behaviour.
        state_file: str
            Path to state file. Replaces log_file argument.
        """
        super(BaseReview, self).__init__()

        # Default to Naive Bayes model
        if model is None:
            model = NBModel()
        if query_model is None:
            query_model = MaxQuery()
        if balance_model is None:
            balance_model = SimpleBalance()
        if feature_model is None:
            feature_model = Tfidf()

        self.as_data = as_data
        self.y = as_data.labels
        if self.y is None:
            self.y = np.full(len(as_data), LABEL_NA)
        self.model = model
        self.balance_model = balance_model
        self.query_model = query_model
        self.feature_model = feature_model

        self.shared = {"query_src": {}, "current_queries": {}}
        self.model.shared = self.shared
        self.query_model.shared = self.shared
        self.balance_model.shared = self.shared

        self.n_papers = n_papers
        self.n_instances = n_instances
        self.n_queries = n_queries
        self.start_idx = start_idx

        if log_file is not None:
            warnings.warn(
                "The log_file argument for BaseReview will be"
                " replaced by state_file.",
                category=FutureWarning)
            self.state_file = log_file
        else:
            self.state_file = state_file

        self.query_i = 0
        self.query_i_classified = 0
        self.train_idx = np.array([], dtype=int)
        self.model_trained = False

        # Restore the state from a file or initialize said file.
        with open_state(self.state_file) as state:
            # From file
            if not state.is_empty():
                startup = state.startup_vals()
                # If there are start indices not in the training add them.
                if not set(startup["train_idx"]) >= set(start_idx):
                    new_idx = list(set(start_idx) - set(startup["train_idx"]))
                    self.classify(new_idx,
                                  self.y[new_idx],
                                  state,
                                  method="initial")
                    startup = state.startup_vals()
                self.train_idx = startup["train_idx"]
                self.y = startup["labels"]
                self.shared["query_src"] = startup["query_src"]
                self.query_i = startup["query_i"]
                self.query_i_classified = startup["query_i_classified"]
            # From scratch
            else:
                state.set_labels(self.y)
                state.settings = self.settings
                self.classify(start_idx,
                              self.y[start_idx],
                              state,
                              method="initial")
                self.query_i_classified = len(start_idx)

            # Try to retrieve feature matrix from the state file.
            try:
                self.X = state.get_feature_matrix(as_data.hash())
            except KeyError:
                self.X = feature_model.fit_transform(as_data.texts,
                                                     as_data.headings,
                                                     as_data.bodies,
                                                     as_data.keywords)
                state._add_as_data(as_data, feature_matrix=self.X)
            if self.X.shape[0] != len(self.y):
                raise ValueError("The state file does not correspond to the "
                                 "given data file, please use another state "
                                 "file or dataset.")
            self.load_current_query(state)
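
Finally, a small hypothetical usage sketch of the balance strategy from
Examples #5 and #7 on toy data. The matrix and labels are made up, shared can
be an empty dict because DoubleBalance only forwards it to the fallback model,
and passing an integer seed assumes get_random_state accepts one:

import numpy as np

# Toy data: 10 papers with 5 features each, 2 of them included.
X = np.random.rand(10, 5)
y = np.array([0, 0, 1, 0, 0, 0, 1, 0, 0, 0])
train_idx = np.arange(10)  # pretend every paper has been reviewed

balancer = DoubleBalance(random_state=42)
X_train, y_train = balancer.sample(X, y, train_idx, shared={})

# The rebalanced set keeps the training size but typically contains a much
# larger fraction of 1's than the raw 20%.
print(len(y_train), y_train.mean())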