def __init__(self, cluster_size=350, update_interval=200, random_state=None,
             **kwargs):
    """Initialize the clustering strategy.

    Arguments
    ---------
    cluster_size: int
        Size of the clusters to be made. If the size of the clusters is
        smaller than the size of the pool, fall back to max sampling.
    update_interval: int
        Update the clustering every x instances.
    random_state: int, RandomState
        State/seed of the RNG.
    **kwargs: dict
        Remaining keyword arguments.
    """
    super(ClusterQuery, self).__init__()
    self.cluster_size = cluster_size
    self.update_interval = update_interval
    self.last_update = None
    self.fallback_model = MaxQuery()
    self._random_state = get_random_state(random_state)
def __init__(self, cluster_size=350, update_interval=200, random_state=None):
    """Initialize the clustering strategy."""
    super(ClusterQuery, self).__init__()
    self.cluster_size = cluster_size
    self.update_interval = update_interval
    self.last_update = None
    self.fallback_model = MaxQuery()
    self._random_state = get_random_state(random_state)
def __init__(self, cluster_size=350, update_interval=200, random_state=None):
    """Initialize the clustering strategy.

    Arguments
    ---------
    cluster_size: int
        Size of the clusters to be made. If the size of the clusters is
        smaller than the size of the pool, fall back to max sampling.
    update_interval: int
        Update the clustering every x instances.
    random_state: int, RandomState
        State/seed of the RNG.
    """
    super(ClusterQuery, self).__init__()
    self.cluster_size = cluster_size
    self.update_interval = update_interval
    self.last_update = None
    self.fallback_model = MaxQuery()
    self._random_state = get_random_state(random_state)
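A minimal usage sketch for the constructor above. The import path is an assumption for illustration; only the keyword arguments shown in the signature are used.

# Import path is an assumption; adjust to wherever ClusterQuery lives in
# your asreview installation.
from asreview.query_strategies import ClusterQuery

# Cluster roughly every 350 pool records, re-cluster after 200 new labels,
# and fix the seed so cluster sampling is reproducible.
query_model = ClusterQuery(cluster_size=350, update_interval=200,
                           random_state=42)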
def __init__(self,
             X,
             y=None,
             model=None,
             query_model=None,
             balance_model=None,
             feature_model=None,
             n_papers=None,
             n_instances=DEFAULT_N_INSTANCES,
             n_queries=None,
             prior_included=[],
             prior_excluded=[],
             log_file=None,
             final_labels=None,
             verbose=1,
             data_fp=None,
             ):
    """Initialize base class for systematic reviews.

    Arguments
    ---------
    X: np.array
        The feature matrix for the current dataset.
    y: np.array
        Labels of each paper, 1 for included, 0 for excluded. Can be set
        to None, to indicate inclusion data is not available.
    model: BaseModel
        Initialized model to fit the data during active learning. See
        asreview.models.utils.py for possible models.
    query_model: BaseQueryModel
        Initialized model to query new instances for review, such as
        random sampling or max sampling. See
        asreview.query_strategies.utils.py for query models.
    balance_model: BaseBalanceModel
        Initialized model to redistribute the training data during the
        active learning process. They might either resample or
        undersample specific papers.
    n_papers: int
        Number of papers to review during the active learning process,
        excluding the number of initial priors. To review all papers, set
        n_papers to None.
    n_instances: int
        Number of papers to query at each step in the active learning
        process.
    n_queries: int
        Number of steps/queries to perform. Set to None for no limit.
    prior_included: list
        List of papers (ids) that are included a priori.
    prior_excluded: list
        List of papers (ids) that are excluded a priori.
    log_file: str
        Path to log file.
    final_labels: np.array
        Final labels if we're using a two step inclusion process. For
        example, if at one step a paper is considered after reading the
        abstract and then at the second step, a final decision is made on
        the basis of the full text.
    """
    super(BaseReview, self).__init__()

    self.X = X
    self.y = y
    if y is None:
        self.y = np.full(X.shape[0], NOT_AVAILABLE)
    self.y = np.array(self.y, dtype=np.int)

    # Default to Naive Bayes model
    if model is None:
        model = NBModel()
    if query_model is None:
        query_model = MaxQuery()
    if balance_model is None:
        balance_model = SimpleBalance()
    if feature_model is None:
        raise ValueError("Supply feature model!")

    self.model = model
    self.balance_model = balance_model
    self.query_model = query_model
    self.feature_model = feature_model

    self.shared = {"query_src": {}, "current_queries": {}}
    self.model.shared = self.shared
    self.query_model.shared = self.shared
    self.balance_model.shared = self.shared

    self.n_papers = n_papers
    self.n_instances = n_instances
    self.n_queries = n_queries
    self.log_file = log_file
    self.verbose = verbose

    self.prior_included = prior_included
    self.prior_excluded = prior_excluded

    self.query_i = 0
    self.train_idx = np.array([], dtype=np.int)
    self.model_trained = False
    self.data_fp = data_fp

    with open_logger(log_file) as logger:
        if not logger.is_empty():
            y, train_idx, query_src, query_i = logger.review_state()
            if X.shape[0] != len(y):
                raise ValueError("The log file does not correspond to the "
                                 "given data file, please use another log "
                                 "file or dataset.")
            self.y = y
            self.train_idx = train_idx
            self.shared["query_src"] = query_src
            self.query_i = query_i
        else:
            if final_labels is not None:
                logger.set_final_labels(final_labels)
            logger.set_labels(self.y)
            logger.add_settings(self.settings)
            self._prior_knowledge(logger)
            self.query_i = 0
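The constructor above wires the classifier, query strategy, and balance strategy to one shared dictionary so they can exchange bookkeeping such as the origin of each queried record. A minimal, self-contained sketch of that design choice; the tiny class below is a stand-in for illustration, not an asreview component.

class DummyStrategy:
    def __init__(self):
        self.shared = None  # filled in by the owner, as in BaseReview


query_model = DummyStrategy()
balance_model = DummyStrategy()

shared = {"query_src": {}, "current_queries": {}}
query_model.shared = shared
balance_model.shared = shared

# Because both strategies hold the same dict object, a record noted by one
# component is immediately visible to the other.
query_model.shared["query_src"].setdefault("max", []).append(42)
assert balance_model.shared["query_src"] == {"max": [42]}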
class ClusterQuery(ProbaQueryStrategy):
    "Query strategy using clustering algorithms."

    name = "cluster"

    def __init__(self, cluster_size=350, update_interval=200, **kwargs):
        """Initialize the clustering strategy.

        Arguments
        ---------
        cluster_size: int
            Size of the clusters to be made. If the size of the clusters
            is smaller than the size of the pool, fall back to max
            sampling.
        update_interval: int
            Update the clustering every x instances.
        **kwargs: dict
            Remaining keyword arguments.
        """
        super(ClusterQuery, self).__init__()
        self.cluster_size = cluster_size
        self.update_interval = update_interval
        self.last_update = None
        self.fallback_model = MaxQuery()

    def _query(self, X, pool_idx, n_instances, proba):
        n_samples = X.shape[0]
        if pool_idx is None:
            pool_idx = np.arange(n_samples)

        last_update = self.last_update
        if (last_update is None or self.update_interval is None
                or last_update - len(pool_idx) >= self.update_interval):
            n_clusters = round(len(pool_idx) / self.cluster_size)
            if n_clusters <= 1:
                return self.fallback_model._query(X,
                                                  pool_idx=pool_idx,
                                                  n_instances=n_instances,
                                                  proba=proba)
            model = KMeans(n_clusters=n_clusters, n_init=1)
            self.clusters = model.fit_predict(X)
            self.last_update = len(pool_idx)

        clusters = {}
        for idx in pool_idx:
            cluster_id = self.clusters[idx]
            if cluster_id in clusters:
                clusters[cluster_id].append((idx, proba[idx, 1]))
            else:
                clusters[cluster_id] = [(idx, proba[idx, 1])]

        for cluster_id in clusters:
            clusters[cluster_id] = sorted(clusters[cluster_id],
                                          key=lambda x: x[1])

        clust_idx = []
        cluster_ids = list(clusters)
        for _ in range(n_instances):
            cluster_id = np.random.choice(cluster_ids, 1)[0]
            clust_idx.append(clusters[cluster_id].pop()[0])
            if len(clusters[cluster_id]) == 0:
                del clusters[cluster_id]
                cluster_ids = list(clusters)

        clust_idx = np.array(clust_idx, dtype=int)
        return clust_idx, X[clust_idx]

    def full_hyper_space(self):
        from hyperopt import hp
        parameter_space = {
            "qry_cluster_size": hp.quniform('qry_cluster_size', 50, 1000, 1),
            "qry_update_cluster": hp.quniform('qry_update_cluster',
                                              100, 300, 1),
            "qry_max_frac": hp.uniform('qry_max_frac', 0, 1),
        }
        return parameter_space, {}
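The `_query` method above groups the unlabeled pool by KMeans cluster, sorts each cluster by predicted inclusion probability, and then repeatedly takes the top record from a randomly chosen cluster. A self-contained sketch of that selection logic on synthetic data, using plain numpy and scikit-learn rather than the asreview API:

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 20))          # synthetic feature matrix
proba = rng.uniform(size=(1000, 2))      # synthetic [P(excluded), P(included)]
pool_idx = np.arange(1000)               # everything still unlabeled

# Cluster the matrix, aiming for clusters of roughly 350 records each.
n_clusters = max(2, round(len(pool_idx) / 350))
cluster_labels = KMeans(n_clusters=n_clusters, n_init=1).fit_predict(X)

# Group pool indices per cluster, sorted ascending by inclusion probability,
# so the most promising record of each cluster sits at the end of its list.
clusters = {}
for idx in pool_idx:
    clusters.setdefault(cluster_labels[idx], []).append((idx, proba[idx, 1]))
for members in clusters.values():
    members.sort(key=lambda x: x[1])

# Sample 10 records: pick a random cluster, then take its best remaining one.
selected = []
for _ in range(10):
    cluster_id = rng.choice(list(clusters))
    selected.append(clusters[cluster_id].pop()[0])
    if not clusters[cluster_id]:
        del clusters[cluster_id]
print(selected)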
def __init__(self,
             as_data,
             model=None,
             query_model=None,
             balance_model=None,
             feature_model=None,
             n_papers=None,
             n_instances=DEFAULT_N_INSTANCES,
             n_queries=None,
             start_idx=[],
             state_file=None,
             log_file=None,
             ):
    """Initialize base class for systematic reviews."""
    super(BaseReview, self).__init__()

    # Default to Naive Bayes model
    if model is None:
        model = NBModel()
    if query_model is None:
        query_model = MaxQuery()
    if balance_model is None:
        balance_model = SimpleBalance()
    if feature_model is None:
        feature_model = Tfidf()

    self.as_data = as_data
    self.y = as_data.labels
    if self.y is None:
        self.y = np.full(len(as_data), LABEL_NA)
    self.model = model
    self.balance_model = balance_model
    self.query_model = query_model
    self.feature_model = feature_model

    self.shared = {"query_src": {}, "current_queries": {}}
    self.model.shared = self.shared
    self.query_model.shared = self.shared
    self.balance_model.shared = self.shared

    self.n_papers = n_papers
    self.n_instances = n_instances
    self.n_queries = n_queries
    self.start_idx = start_idx

    if log_file is not None:
        warnings.warn(
            "The log_file argument for BaseReview will be"
            " replaced by state_file.",
            category=FutureWarning)
        self.state_file = log_file
    else:
        self.state_file = state_file

    self.query_i = 0
    self.query_i_classified = 0
    self.train_idx = np.array([], dtype=np.int)
    self.model_trained = False

    # Restore the state from a file or initialize said file.
    with open_state(self.state_file) as state:
        # From file
        if not state.is_empty():
            startup = state.startup_vals()
            # If there are start indices not in the training add them.
            if not set(startup["train_idx"]) >= set(start_idx):
                new_idx = list(set(start_idx) - set(startup["train_idx"]))
                self.classify(new_idx, self.y[new_idx], state,
                              method="initial")
                startup = state.startup_vals()
            self.train_idx = startup["train_idx"]
            self.y = startup["labels"]
            self.shared["query_src"] = startup["query_src"]
            self.query_i = startup["query_i"]
            self.query_i_classified = startup["query_i_classified"]
        # From scratch
        else:
            state.set_labels(self.y)
            state.settings = self.settings
            self.classify(start_idx, self.y[start_idx], state,
                          method="initial")
            self.query_i_classified = len(start_idx)

        # Try to retrieve feature matrix from the state file.
        try:
            self.X = state.get_feature_matrix(as_data.hash())
        except KeyError:
            self.X = feature_model.fit_transform(
                as_data.texts, as_data.headings, as_data.bodies,
                as_data.keywords)
            state._add_as_data(as_data, feature_matrix=self.X)
        if self.X.shape[0] != len(self.y):
            raise ValueError("The state file does not correspond to the "
                             "given data file, please use another state "
                             "file or dataset.")
        self.load_current_query(state)
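The constructor above keeps the old log_file keyword working while steering callers to state_file. A minimal sketch of that deprecation pattern in isolation; the function name here is made up for illustration.

import warnings

def open_review(state_file=None, log_file=None):
    # Accept the old keyword for now, but tell callers it is going away.
    if log_file is not None:
        warnings.warn("The log_file argument will be replaced by state_file.",
                      category=FutureWarning)
        state_file = log_file
    return state_file

open_review(log_file="old_run.h5")  # warns, then behaves like state_file=...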
def __init__(self,
             as_data,
             model=None,
             query_model=None,
             balance_model=None,
             feature_model=None,
             n_papers=None,
             n_instances=DEFAULT_N_INSTANCES,
             n_queries=None,
             start_idx=[],
             state_file=None,
             log_file=None,
             verbose=1,
             data_fp=None,
             ):
    """Initialize base class for systematic reviews.

    Arguments
    ---------
    as_data: asreview.ASReviewData
        The data object which contains the text, labels, etc.
    model: BaseModel
        Initialized model to fit the data during active learning. See
        asreview.models.utils.py for possible models.
    query_model: BaseQueryModel
        Initialized model to query new instances for review, such as
        random sampling or max sampling. See
        asreview.query_strategies.utils.py for query models.
    balance_model: BaseBalanceModel
        Initialized model to redistribute the training data during the
        active learning process. They might either resample or
        undersample specific papers.
    feature_model: BaseFeatureModel
        Feature extraction model that converts texts and keywords to
        feature matrices.
    n_papers: int
        Number of papers to review during the active learning process,
        excluding the number of initial priors. To review all papers, set
        n_papers to None.
    n_instances: int
        Number of papers to query at each step in the active learning
        process.
    n_queries: int
        Number of steps/queries to perform. Set to None for no limit.
    start_idx: numpy.array
        Start the simulation/review with these indices. They are assumed
        to be already labeled. Failing to do so might result in bad
        behaviour.
    state_file: str
        Path to state file. Replaces log_file argument.
    """
    super(BaseReview, self).__init__()

    # Default to Naive Bayes model
    if model is None:
        model = NBModel()
    if query_model is None:
        query_model = MaxQuery()
    if balance_model is None:
        balance_model = SimpleBalance()
    if feature_model is None:
        feature_model = Tfidf()

    self.as_data = as_data
    self.y = as_data.labels
    if self.y is None:
        self.y = np.full(len(as_data), LABEL_NA)
    self.model = model
    self.balance_model = balance_model
    self.query_model = query_model
    self.feature_model = feature_model

    self.shared = {"query_src": {}, "current_queries": {}}
    self.model.shared = self.shared
    self.query_model.shared = self.shared
    self.balance_model.shared = self.shared

    self.n_papers = n_papers
    self.n_instances = n_instances
    self.n_queries = n_queries
    self.start_idx = start_idx

    if log_file is not None:
        warnings.warn(
            "The log_file argument for BaseReview will be"
            " replaced by state_file.",
            category=FutureWarning)
        self.state_file = log_file
    else:
        self.state_file = state_file
    self.verbose = verbose

    self.query_i = 0
    self.query_i_classified = 0
    self.train_idx = np.array([], dtype=np.int)
    self.model_trained = False
    self.data_fp = data_fp

    with open_state(self.state_file) as state:
        if not state.is_empty():
            startup = state.startup_vals()
            if not set(startup["train_idx"]) >= set(start_idx):
                new_idx = list(set(start_idx) - set(startup["train_idx"]))
                self.classify(new_idx, self.y[new_idx], state,
                              method="initial")
                startup = state.startup_vals()
            self.train_idx = startup["train_idx"]
            self.y = startup["labels"]
            self.shared["query_src"] = startup["query_src"]
            self.query_i = startup["query_i"]
            self.query_i_classified = startup["query_i_classified"]
        else:
            state.set_labels(self.y)
            state.settings = self.settings
            self.classify(start_idx, self.y[start_idx], state,
                          method="initial")
            self.query_i_classified = len(start_idx)

        try:
            self.X = state.get_feature_matrix(as_data.hash())
        except KeyError:
            self.X = feature_model.fit_transform(
                as_data.texts, as_data.headings, as_data.bodies,
                as_data.keywords)
            state._add_as_data(as_data, feature_matrix=self.X)
        if self.X.shape[0] != len(self.y):
            raise ValueError("The state file does not correspond to the "
                             "given data file, please use another state "
                             "file or dataset.")
        self.load_current_query(state)
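Both BaseReview variants follow the same startup pattern: open the state, then either restore labels and training indices from it or seed it with the prior indices. A small, generic sketch of that pattern using a plain JSON file; the helper name and file layout are made up for illustration and are much simpler than asreview's open_state/state API.

import json
import os
import numpy as np

def open_or_init_state(path, labels, start_idx):
    """Return (labels, train_idx) restored from `path`, or initialize it."""
    if os.path.exists(path):
        # Restore a previous session from the state file.
        with open(path) as f:
            stored = json.load(f)
        labels = np.array(stored["labels"])
        train_idx = np.array(stored["train_idx"], dtype=int)
        # Add any prior indices that were not yet in the stored training set.
        new_idx = sorted(set(start_idx) - set(train_idx.tolist()))
        train_idx = np.append(train_idx, new_idx).astype(int)
    else:
        # Start from scratch: the priors form the initial training set.
        train_idx = np.array(start_idx, dtype=int)
    with open(path, "w") as f:
        json.dump({"labels": np.asarray(labels).tolist(),
                   "train_idx": train_idx.tolist()}, f)
    return labels, train_idx

labels, train_idx = open_or_init_state("review_state.json",
                                       labels=np.full(1000, -1),
                                       start_idx=[0, 5, 9])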
def __init__(self,
             as_data,
             model=None,
             query_model=None,
             balance_model=None,
             feature_model=None,
             n_papers=None,
             n_instances=DEFAULT_N_INSTANCES,
             n_queries=None,
             start_idx=[],
             state_file=None,
             log_file=None,
             ):
    """Initialize base class for systematic reviews.

    Arguments
    ---------
    as_data: asreview.ASReviewData
        The data object which contains the text, labels, etc.
    model: BaseModel
        Initialized model to fit the data during active learning. See
        asreview.models.utils.py for possible models.
    query_model: BaseQueryModel
        Initialized model to query new instances for review, such as
        random sampling or max sampling. See
        asreview.query_strategies.utils.py for query models.
    balance_model: BaseBalanceModel
        Initialized model to redistribute the training data during the
        active learning process. They might either resample or
        undersample specific papers.
    feature_model: BaseFeatureModel
        Feature extraction model that converts texts and keywords to
        feature matrices.
    n_papers: int
        Number of papers to review during the active learning process,
        excluding the number of initial priors. To review all papers, set
        n_papers to None.
    n_instances: int
        Number of papers to query at each step in the active learning
        process.
    n_queries: int
        Number of steps/queries to perform. Set to None for no limit.
    start_idx: numpy.array
        Start the simulation/review with these indices. They are assumed
        to be already labeled. Failing to do so might result in bad
        behaviour.
    state_file: str
        Path to state file. Replaces log_file argument.
    """
    super(BaseReview, self).__init__()

    # Default to Naive Bayes model
    if model is None:
        model = NBModel()
    if query_model is None:
        query_model = MaxQuery()
    if balance_model is None:
        balance_model = SimpleBalance()
    if feature_model is None:
        feature_model = Tfidf()

    self.as_data = as_data
    self.y = as_data.labels
    if self.y is None:
        self.y = np.full(len(as_data), LABEL_NA)
    self.model = model
    self.balance_model = balance_model
    self.query_model = query_model
    self.feature_model = feature_model

    self.shared = {"query_src": {}, "current_queries": {}}
    self.model.shared = self.shared
    self.query_model.shared = self.shared
    self.balance_model.shared = self.shared

    self.n_papers = n_papers
    self.n_instances = n_instances
    self.n_queries = n_queries
    self.start_idx = start_idx

    if log_file is not None:
        warnings.warn(
            "The log_file argument for BaseReview will be"
            " replaced by state_file.",
            category=FutureWarning)
        self.state_file = log_file
    else:
        self.state_file = state_file

    self.query_i = 0
    self.query_i_classified = 0
    self.train_idx = np.array([], dtype=np.int)
    self.model_trained = False

    # Restore the state from a file or initialize said file.
    with open_state(self.state_file) as state:
        # From file
        if not state.is_empty():
            startup = state.startup_vals()
            # If there are start indices not in the training add them.
            if not set(startup["train_idx"]) >= set(start_idx):
                new_idx = list(set(start_idx) - set(startup["train_idx"]))
                self.classify(new_idx, self.y[new_idx], state,
                              method="initial")
                startup = state.startup_vals()
            self.train_idx = startup["train_idx"]
            self.y = startup["labels"]
            self.shared["query_src"] = startup["query_src"]
            self.query_i = startup["query_i"]
            self.query_i_classified = startup["query_i_classified"]
        # From scratch
        else:
            state.set_labels(self.y)
            state.settings = self.settings
            self.classify(start_idx, self.y[start_idx], state,
                          method="initial")
            self.query_i_classified = len(start_idx)

        # Try to retrieve feature matrix from the state file.
        try:
            self.X = state.get_feature_matrix(as_data.hash())
        except KeyError:
            self.X = feature_model.fit_transform(
                as_data.texts, as_data.headings, as_data.bodies,
                as_data.keywords)
            state._add_as_data(as_data, feature_matrix=self.X)
        if self.X.shape[0] != len(self.y):
            raise ValueError("The state file does not correspond to the "
                             "given data file, please use another state "
                             "file or dataset.")
        self.load_current_query(state)
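The tail of the constructor caches the feature matrix in the state under a hash of the dataset, so a re-run with the same data can skip feature extraction. A small generic sketch of that cache-by-hash pattern; the helper name, cache layout, and the CountVectorizer stand-in for feature_model.fit_transform are illustrative assumptions.

import hashlib
from pathlib import Path
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def get_or_build_features(texts, cache_dir="feature_cache"):
    """Load a feature matrix cached under a hash of the texts, else build it."""
    data_hash = hashlib.sha1("\n".join(texts).encode("utf-8")).hexdigest()
    cache_file = Path(cache_dir) / f"{data_hash}.npy"
    if cache_file.exists():
        return np.load(cache_file)
    # Stand-in for feature_model.fit_transform(...): simple word counts.
    X = CountVectorizer().fit_transform(texts).toarray()
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    np.save(cache_file, X)
    return X

X = get_or_build_features(["a systematic review", "an active learning loop"])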
class ClusterQuery(ProbaQueryStrategy):
    """Query strategy using clustering algorithms.

    Use clustering after feature extraction on the dataset. Then the highest
    probabilities within random clusters are sampled.

    Arguments
    ---------
    cluster_size: int
        Size of the clusters to be made. If the size of the clusters is
        smaller than the size of the pool, fall back to max sampling.
    update_interval: int
        Update the clustering every x instances.
    random_state: int, RandomState
        State/seed of the RNG.
    """

    name = "cluster"

    def __init__(self, cluster_size=350, update_interval=200,
                 random_state=None):
        """Initialize the clustering strategy."""
        super(ClusterQuery, self).__init__()
        self.cluster_size = cluster_size
        self.update_interval = update_interval
        self.last_update = None
        self.fallback_model = MaxQuery()
        self._random_state = get_random_state(random_state)

    def _query(self, X, pool_idx, n_instances, proba):
        n_samples = X.shape[0]
        if pool_idx is None:
            pool_idx = np.arange(n_samples)

        last_update = self.last_update
        if (last_update is None or self.update_interval is None
                or last_update - len(pool_idx) >= self.update_interval):
            n_clusters = round(len(pool_idx) / self.cluster_size)
            if n_clusters <= 1:
                return self.fallback_model._query(X,
                                                  pool_idx=pool_idx,
                                                  n_instances=n_instances,
                                                  proba=proba)
            model = KMeans(n_clusters=n_clusters, n_init=1,
                           random_state=self._random_state)
            self.clusters = model.fit_predict(X)
            self.last_update = len(pool_idx)

        clusters = {}
        for idx in pool_idx:
            cluster_id = self.clusters[idx]
            if cluster_id in clusters:
                clusters[cluster_id].append((idx, proba[idx, 1]))
            else:
                clusters[cluster_id] = [(idx, proba[idx, 1])]

        for cluster_id in clusters:
            clusters[cluster_id] = sorted(clusters[cluster_id],
                                          key=lambda x: x[1])

        clust_idx = []
        cluster_ids = list(clusters)
        for _ in range(n_instances):
            cluster_id = self._random_state.choice(cluster_ids, 1)[0]
            clust_idx.append(clusters[cluster_id].pop()[0])
            if len(clusters[cluster_id]) == 0:
                del clusters[cluster_id]
                cluster_ids = list(clusters)

        clust_idx = np.array(clust_idx, dtype=int)
        return clust_idx, X[clust_idx]

    def full_hyper_space(self):
        from hyperopt import hp
        parameter_space = {
            "qry_cluster_size": hp.quniform('qry_cluster_size', 50, 1000, 1),
            "qry_update_interval": hp.quniform('qry_update_interval',
                                               100, 300, 1),
        }
        return parameter_space, {}
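A usage sketch for the class above, driving `_query` with a synthetic feature matrix and probability estimates. The import path is an assumption, and the data is purely illustrative; only the signature shown in the class is used.

import numpy as np
# Import path is an assumption; in older asreview releases the strategy is
# exposed under asreview.query_strategies.
from asreview.query_strategies import ClusterQuery

rng = np.random.default_rng(42)
X = rng.normal(size=(2000, 40))        # stand-in feature matrix
proba = rng.uniform(size=(2000, 2))    # stand-in classifier probabilities
pool_idx = np.arange(2000)             # all records still unlabeled

strategy = ClusterQuery(cluster_size=350, update_interval=200, random_state=42)
query_idx, _ = strategy._query(X, pool_idx=pool_idx, n_instances=10,
                               proba=proba)
print(query_idx)                       # 10 indices spread over the clusters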