def get_reviewer(dataset,
                 mode='oracle',
                 model=DEFAULT_MODEL,
                 query_strategy=DEFAULT_QUERY_STRATEGY,
                 balance_strategy=DEFAULT_BALANCE_STRATEGY,
                 n_instances=DEFAULT_N_INSTANCES,
                 n_queries=1,
                 embedding_fp=None,
                 verbose=1,
                 prior_included=None,
                 prior_excluded=None,
                 n_prior_included=DEFAULT_N_PRIOR_INCLUDED,
                 n_prior_excluded=DEFAULT_N_PRIOR_EXCLUDED,
                 config_file=None,
                 src_log_fp=None,
                 **kwargs):

    # Find the URL of the dataset if it is an example dataset.
    if dataset in DEMO_DATASETS.keys():
        dataset = DEMO_DATASETS[dataset]

    if src_log_fp is not None:
        logger = Logger(log_fp=src_log_fp)
        settings = logger.settings
    else:
        logger = None
        settings = ASReviewSettings(model=model,
                                    n_instances=n_instances,
                                    n_queries=n_queries,
                                    n_prior_included=n_prior_included,
                                    n_prior_excluded=n_prior_excluded,
                                    query_strategy=query_strategy,
                                    balance_strategy=balance_strategy,
                                    mode=mode,
                                    data_fp=dataset)
        settings.from_file(config_file)

    model = settings.model

    if model in ["lstm_base", "lstm_pool"]:
        base_model = "RNN"
    else:
        base_model = "other"

    # Check if mode is valid.
    if mode in AVAILABLE_REVIEW_CLASSES:
        if verbose:
            print(f"Start review in '{mode}' mode.")
    else:
        raise ValueError(f"Unknown mode '{mode}'.")
    print(f"Model: '{model}'")

    # If the provided file is a pickle file.
    if is_pickle(dataset):
        with open(dataset, 'rb') as f:
            data_obj = pickle.load(f)
        if isinstance(data_obj, tuple) and len(data_obj) == 3:
            X, y, embedding_matrix = data_obj
        elif isinstance(data_obj, tuple) and len(data_obj) == 4:
            X, y, embedding_matrix, _ = data_obj
        else:
            raise ValueError("Incorrect pickle object.")
    else:
        as_data = ASReviewData.from_file(dataset)
        _, texts, labels = as_data.get_data()

        # Get the model.
        if base_model == "RNN":
            if embedding_fp is None:
                embedding_fp = Path(
                    get_data_home(),
                    EMBEDDING_EN["name"]
                ).expanduser()
                if not embedding_fp.exists():
                    print("Warning: will start to download large "
                          "embedding file in 10 seconds.")
                    time.sleep(10)
                    download_embedding(verbose=verbose)

            # Create features and labels.
            X, word_index = text_to_features(texts)
            y = labels
            embedding = load_embedding(embedding_fp, word_index=word_index)
            embedding_matrix = sample_embedding(embedding, word_index)

        elif model.lower() in ['nb', 'svc', 'svm']:
            from sklearn.pipeline import Pipeline
            from sklearn.feature_extraction.text import TfidfTransformer
            from sklearn.feature_extraction.text import CountVectorizer

            text_clf = Pipeline([('vect', CountVectorizer()),
                                 ('tfidf', TfidfTransformer())])

            X = text_clf.fit_transform(texts)
            y = labels

    settings.fit_kwargs = {}
    settings.query_kwargs = {}

    if base_model == 'RNN':
        if model == "lstm_base":
            model_kwargs = lstm_base_model_defaults(settings, verbose)
            create_lstm_model = create_lstm_base_model
        elif model == "lstm_pool":
            model_kwargs = lstm_pool_model_defaults(settings, verbose)
            create_lstm_model = create_lstm_pool_model
        else:
            raise ValueError(f"Unknown model {model}")

        settings.fit_kwargs = lstm_fit_defaults(settings, verbose)
        settings.query_kwargs['verbose'] = verbose

        # Create the model.
        model = KerasClassifier(create_lstm_model(
            embedding_matrix=embedding_matrix, **model_kwargs),
            verbose=verbose)

    elif model.lower() in ['nb']:
        from asreview.models import create_nb_model
        model = create_nb_model()

    elif model.lower() in ['svm', 'svc']:
        from asreview.models import create_svc_model
        model = create_svc_model()

    else:
        raise ValueError('Model not found.')

    # Pick query strategy.
    query_fn, query_str = get_query_strategy(settings)
    if verbose:
        print(f"Query strategy: {query_str}")

    train_data_fn, train_method = get_balance_strategy(settings)
    if verbose:
        print(f"Using {train_method} method to obtain training data.")

    # Initialize the review class.
    if mode == "simulate":
        reviewer = ReviewSimulate(X, y,
                                  model=model,
                                  query_strategy=query_fn,
                                  train_data_fn=train_data_fn,
                                  n_instances=settings.n_instances,
                                  n_queries=settings.n_queries,
                                  verbose=verbose,
                                  prior_included=prior_included,
                                  prior_excluded=prior_excluded,
                                  n_prior_included=settings.n_prior_included,
                                  n_prior_excluded=settings.n_prior_excluded,
                                  fit_kwargs=settings.fit_kwargs,
                                  balance_kwargs=settings.balance_kwargs,
                                  query_kwargs=settings.query_kwargs,
                                  logger=logger,
                                  **kwargs)
    elif mode == "oracle":
        reviewer = ReviewOracle(X,
                                model=model,
                                query_strategy=query_fn,
                                as_data=as_data,
                                train_data_fn=train_data_fn,
                                n_instances=settings.n_instances,
                                n_queries=settings.n_queries,
                                verbose=verbose,
                                prior_included=prior_included,
                                prior_excluded=prior_excluded,
                                fit_kwargs=settings.fit_kwargs,
                                balance_kwargs=settings.balance_kwargs,
                                query_kwargs=settings.query_kwargs,
                                logger=logger,
                                **kwargs)
    elif mode == "minimal":
        reviewer = MinimalReview(X,
                                 model=model,
                                 query_strategy=query_fn,
                                 train_data_fn=train_data_fn,
                                 n_instances=settings.n_instances,
                                 n_queries=settings.n_queries,
                                 verbose=verbose,
                                 prior_included=prior_included,
                                 prior_excluded=prior_excluded,
                                 fit_kwargs=settings.fit_kwargs,
                                 balance_kwargs=settings.balance_kwargs,
                                 query_kwargs=settings.query_kwargs,
                                 logger=logger,
                                 **kwargs)
    else:
        raise ValueError("Error finding mode, should never come here...")

    reviewer._logger.add_settings(settings)

    return reviewer
def __init__(self,
             X,
             y=None,
             model=None,
             query_strategy=max_sampling,
             train_data_fn=full_sample,
             n_instances=1,
             n_queries=1,
             prior_included=[],
             prior_excluded=[],
             log_file=None,
             fit_kwargs={},
             balance_kwargs={},
             query_kwargs={},
             logger=None,
             verbose=1):
    super(BaseReview, self).__init__()

    self.X = X
    self.y = y
    if y is None:
        self.y = np.full(X.shape[0], NOT_AVAILABLE)

    # Default to the Naive Bayes model.
    if model is None:
        print("Warning: using naive Bayes model as default. "
              "If you experience bad performance, read the documentation "
              "in order to implement an RNN based solution.")
        from asreview.models import create_nb_model
        model = create_nb_model()

    self.model = model
    self.query_strategy = query_strategy
    self.train_data = train_data_fn

    self.n_instances = n_instances
    self.n_queries = n_queries
    self.log_file = log_file
    self.verbose = verbose

    self.prior_included = prior_included
    self.prior_excluded = prior_excluded

    self.fit_kwargs = fit_kwargs
    self.balance_kwargs = balance_kwargs
    self.query_kwargs = query_kwargs

    self.query_i = 0
    self.train_idx = np.array([], dtype=np.int)
    self.model_trained = False
    self.query_kwargs["src_query_idx"] = {}

    if logger is None:
        self._logger = Logger()
        self.start_from_logger = False
    else:
        self._logger = logger
        self._prepare_with_logger()
        self.start_from_logger = True

    # Initialize learner, but don't start training yet.
    self.learner = ActiveLearner(estimator=self.model,
                                 query_strategy=self.query_strategy)
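# --- Sketch: driving the modAL ActiveLearner; not part of the original code ---
# The constructor above only builds the learner ("don't start training yet").
# The review loop elsewhere calls teach() on the labelled records and query()
# on the remaining pool. The snippet below shows those two modAL calls on toy
# data, using modAL's built-in uncertainty_sampling in place of asreview's
# max_sampling.
#
#   import numpy as np
#   from modAL.models import ActiveLearner
#   from modAL.uncertainty import uncertainty_sampling
#   from sklearn.naive_bayes import MultinomialNB
#
#   X_pool = np.random.randint(0, 5, size=(100, 20))   # toy feature counts
#   y_pool = np.random.randint(0, 2, size=100)          # toy labels
#
#   learner = ActiveLearner(estimator=MultinomialNB(),
#                           query_strategy=uncertainty_sampling)
#   learner.teach(X_pool[:10], y_pool[:10])    # fit on prior knowledge
#   query_idx, _ = learner.query(X_pool[10:])  # next records to label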
def __init__(self,
             X,
             y=None,
             model=None,
             query_strategy=max_sampling,
             train_data_fn=full_sample,
             n_papers=None,
             n_instances=DEFAULT_N_INSTANCES,
             n_queries=None,
             prior_included=[],
             prior_excluded=[],
             log_file=None,
             fit_kwargs={},
             balance_kwargs={},
             query_kwargs={},
             final_labels=None,
             verbose=1):
    super(BaseReview, self).__init__()

    self.X = X
    self.y = y
    if y is None:
        self.y = np.full(X.shape[0], NOT_AVAILABLE)
    self.y = np.array(self.y, dtype=np.int)

    # Default to the Naive Bayes model.
    if model is None:
        from asreview.models import create_nb_model
        model = create_nb_model()

    self.model = model
    self.query_strategy = query_strategy
    self.balance_strategy = train_data_fn

    self.n_papers = n_papers
    self.n_instances = n_instances
    self.n_queries = n_queries
    self.log_file = log_file
    self.verbose = verbose

    self.prior_included = prior_included
    self.prior_excluded = prior_excluded
    if prior_included is None:
        self.prior_included = []
    if prior_excluded is None:
        self.prior_excluded = []

    self.fit_kwargs = fit_kwargs
    self.balance_kwargs = balance_kwargs
    self.query_kwargs = query_kwargs

    self.query_i = 0
    self.train_idx = np.array([], dtype=np.int)
    self.model_trained = False

    self.query_kwargs["query_src"] = {}
    self.query_kwargs["current_queries"] = {}

    with open_logger(log_file) as logger:
        if not logger.is_empty():
            # Resume a previous review from the log state.
            y, train_idx, query_src, query_i = logger.review_state()
            self.y = y
            self.train_idx = train_idx
            self.query_kwargs["query_src"] = query_src
            self.query_i = query_i
        else:
            if final_labels is not None:
                logger.set_final_labels(final_labels)
            logger.set_labels(self.y)
            init_idx, init_labels = self._prior_knowledge()
            self.query_i = 0
            self.train_idx = np.array([], dtype=np.int)
            self.classify(init_idx, init_labels, logger, method="initial")

    # Initialize learner, but don't start training yet.
    self.learner = ActiveLearner(
        estimator=self.model,
        query_strategy=self.query_strategy
    )
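# --- Sketch: constructing the simulation class directly; not original code ---
# Instead of going through get_reviewer(), a review class can be built from a
# precomputed feature matrix. The TF-IDF features mirror the 'nb'/'svm' branch
# of get_reviewer() above; the texts, labels and counts are toy placeholders,
# and only keyword arguments already passed to ReviewSimulate by get_reviewer()
# are used.
#
#   import numpy as np
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   from asreview.models import create_nb_model
#
#   texts = ["relevant title and abstract", "irrelevant record one",
#            "irrelevant record two", "another relevant abstract"]
#   labels = np.array([1, 0, 0, 1])
#
#   X_feat = TfidfVectorizer().fit_transform(texts)
#   reviewer = ReviewSimulate(X_feat, labels,
#                             model=create_nb_model(),
#                             n_instances=1,
#                             n_prior_included=1,
#                             n_prior_excluded=1)
#   reviewer.review()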