def test_query(query_strategy, n_features=50, n_sample=100,
               n_instances_list=[0, 1, 5, 50], n_train_idx=[0, 1, 5, 50]):
    classifier = get_model("rf")
    if query_strategy == "cluster":
        data_fp = os.path.join("test", "demo_data", "generic.csv")
        texts = ASReviewData.from_file(data_fp).texts
        # Repeat the demo texts until there are at least n_features of them.
        while len(texts) < n_features:
            texts = np.append(texts, texts)
        texts = texts[:n_features]
        query_model = get_query_model(query_strategy, texts=texts,
                                      update_interval=None,
                                      cluster_size=int(n_sample / 3))
        assert isinstance(query_model.param, dict)
    else:
        query_model = get_query_model(query_strategy)

    # Build a random, shuffled dataset with balanced labels.
    X = np.random.rand(n_sample, n_features)
    y = np.concatenate((np.zeros(n_sample // 2), np.ones(n_sample // 2)),
                       axis=0)
    order = np.random.permutation(n_sample)
    X = X[order]
    y = y[order]

    sources = query_strategy.split('_')
    classifier.fit(X, y)
    assert isinstance(query_model.param, dict)
    assert query_model.name == query_strategy

    for n_instances in n_instances_list:
        for n_train in n_train_idx:
            shared = {"query_src": {}, "current_queries": {}}
            train_idx = np.random.choice(np.arange(n_sample), n_train,
                                         replace=False)
            pool_idx = np.delete(np.arange(n_sample), train_idx)
            query_idx, X_query = query_model.query(X, classifier, pool_idx,
                                                   n_instances, shared)
            check_integrity(query_idx, X_query, X, pool_idx, shared,
                            n_instances, sources)

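# A minimal sketch of how test_query could be driven over several strategies
# with pytest; the strategy names listed here are assumptions and should be
# replaced by whatever get_query_model() actually registers.
import pytest


@pytest.mark.parametrize("query_strategy",
                         ["max", "random", "uncertainty", "cluster"])
def test_query_all_strategies(query_strategy):
    test_query(query_strategy)
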
def test_features(feature_extraction, split_ta):
    embedding_fp = os.path.join("test", "demo_data", "generic.vec")
    data_fp = os.path.join("test", "demo_data", "generic.csv")
    as_data = ASReviewData.from_file(data_fp)
    texts = as_data.texts
    if feature_extraction.startswith("embedding-"):
        model = get_feature_model(feature_extraction, split_ta=split_ta,
                                  embedding_fp=embedding_fp)
    else:
        model = get_feature_model(feature_extraction, split_ta=split_ta)

    X = model.fit_transform(texts, titles=as_data.title,
                            abstracts=as_data.abstract)
    assert X.shape[0] == len(as_data.title)
    assert X.shape[1] > 0
    assert isinstance(model.param, dict)
    assert model.name == feature_extraction

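# A standalone sketch of the same feature pipeline outside the test harness,
# assuming "tfidf" is one of the names get_feature_model() accepts (the test
# above only shows the "embedding-" prefix explicitly).
def embed_demo_texts():
    as_data = ASReviewData.from_file(
        os.path.join("test", "demo_data", "generic.csv"))
    model = get_feature_model("tfidf", split_ta=0)
    X = model.fit_transform(as_data.texts, titles=as_data.title,
                            abstracts=as_data.abstract)
    return X.shape
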
def test_csv_write_data():
    fp_in = Path("test", "demo_data", "csv_example_with_labels.csv")
    fp_out = Path("test", "out_data", "csv_out_example.csv")
    asr_data = ASReviewData.from_file(fp_in)
    asr_data.to_csv(fp_out, labels=[0, 1, 0, 1, 0, 1])

def get_reviewer(dataset,
                 mode='oracle',
                 model=DEFAULT_MODEL,
                 query_strategy=DEFAULT_QUERY_STRATEGY,
                 balance_strategy=DEFAULT_BALANCE_STRATEGY,
                 n_instances=DEFAULT_N_INSTANCES,
                 n_papers=None,
                 n_queries=None,
                 embedding_fp=None,
                 verbose=0,
                 prior_included=None,
                 prior_excluded=None,
                 n_prior_included=DEFAULT_N_PRIOR_INCLUDED,
                 n_prior_excluded=DEFAULT_N_PRIOR_EXCLUDED,
                 config_file=None,
                 log_file=None,
                 model_param=None,
                 query_param=None,
                 balance_param=None,
                 abstract_only=False,
                 **kwargs):
    """Get a review object from arguments.

    See __main__.py for a description of the arguments.
    """
    # Find the URL of the dataset if it is one of the example datasets.
    if dataset in DEMO_DATASETS.keys():
        dataset = DEMO_DATASETS[dataset]

    cli_settings = ASReviewSettings(
        model=model, n_instances=n_instances, n_queries=n_queries,
        n_papers=n_papers, n_prior_included=n_prior_included,
        n_prior_excluded=n_prior_excluded, query_strategy=query_strategy,
        balance_strategy=balance_strategy, mode=mode, data_fp=dataset,
        abstract_only=abstract_only)
    cli_settings.from_file(config_file)

    if log_file is not None:
        with open_logger(log_file) as logger:
            if logger.is_empty():
                logger.add_settings(cli_settings)
            settings = logger.settings
    else:
        settings = cli_settings
        logger = None

    if n_queries is not None:
        settings.n_queries = n_queries
    if n_papers is not None:
        settings.n_papers = n_papers
    if model_param is not None:
        settings.model_param = model_param
    if query_param is not None:
        settings.query_param = query_param
    if balance_param is not None:
        settings.balance_param = balance_param

    model = settings.model

    # Check if mode is valid.
    if mode in AVAILABLE_REVIEW_CLASSES:
        logging.info(f"Start review in '{mode}' mode.")
    else:
        raise ValueError(f"Unknown mode '{mode}'.")
    logging.debug(settings)

    as_data = ASReviewData.from_file(dataset,
                                     abstract_only=settings.abstract_only)
    _, texts, labels = as_data.get_data()
    if as_data.final_labels is not None:
        with open_logger(log_file) as logger:
            logger.set_final_labels(as_data.final_labels)

    model_class = get_model_class(model)
    model_inst = model_class(param=settings.model_param,
                             embedding_fp=embedding_fp)
    X, y = model_inst.get_Xy(texts, labels)
    model_fn = model_inst.model()
    settings.fit_kwargs = model_inst.fit_kwargs()
    settings.query_kwargs = {}

    # Pick the query strategy.
    query_fn, query_str = get_query_with_settings(settings)
    logging.info(f"Query strategy: {query_str}")

    train_data_fn, train_method = get_balance_with_settings(settings)
    logging.info(f"Using {train_method} method to obtain training data.")

    # Initialize the review class.
if mode == "simulate": reviewer = ReviewSimulate(X, y, model=model_fn, query_strategy=query_fn, train_data_fn=train_data_fn, n_papers=settings.n_papers, n_instances=settings.n_instances, n_queries=settings.n_queries, verbose=verbose, prior_included=prior_included, prior_excluded=prior_excluded, n_prior_included=settings.n_prior_included, n_prior_excluded=settings.n_prior_excluded, fit_kwargs=settings.fit_kwargs, balance_kwargs=settings.balance_kwargs, query_kwargs=settings.query_kwargs, log_file=log_file, final_labels=as_data.final_labels, **kwargs) elif mode == "oracle": reviewer = ReviewOracle(X, model=model_fn, query_strategy=query_fn, as_data=as_data, train_data_fn=train_data_fn, n_papers=settings.n_papers, n_instances=settings.n_instances, n_queries=settings.n_queries, verbose=verbose, prior_included=prior_included, prior_excluded=prior_excluded, fit_kwargs=settings.fit_kwargs, balance_kwargs=settings.balance_kwargs, query_kwargs=settings.query_kwargs, log_file=log_file, **kwargs) elif mode == "minimal": reviewer = MinimalReview(X, model=model_fn, query_strategy=query_fn, train_data_fn=train_data_fn, n_papers=settings.n_papers, n_instances=settings.n_instances, n_queries=settings.n_queries, verbose=verbose, prior_included=prior_included, prior_excluded=prior_excluded, fit_kwargs=settings.fit_kwargs, balance_kwargs=settings.balance_kwargs, query_kwargs=settings.query_kwargs, log_file=log_file, **kwargs) else: raise ValueError("Error finding mode, should never come here...") return reviewer
def get_reviewer(dataset,
                 mode='oracle',
                 model=DEFAULT_MODEL,
                 query_strategy=DEFAULT_QUERY_STRATEGY,
                 balance_strategy=DEFAULT_BALANCE_STRATEGY,
                 feature_extraction=DEFAULT_FEATURE_EXTRACTION,
                 n_instances=DEFAULT_N_INSTANCES,
                 n_papers=None,
                 n_queries=None,
                 embedding_fp=None,
                 verbose=0,
                 prior_included=None,
                 prior_excluded=None,
                 n_prior_included=DEFAULT_N_PRIOR_INCLUDED,
                 n_prior_excluded=DEFAULT_N_PRIOR_EXCLUDED,
                 config_file=None,
                 log_file=None,
                 model_param=None,
                 query_param=None,
                 balance_param=None,
                 feature_param=None,
                 abstract_only=False,
                 extra_dataset=[],
                 **kwargs):
    """Get a review object from arguments.

    See __main__.py for a description of the arguments.
    """
    # Find the URL of the dataset if it is one of the example datasets.
    if dataset in DEMO_DATASETS.keys():
        dataset = DEMO_DATASETS[dataset]

    cli_settings = ASReviewSettings(
        model=model, n_instances=n_instances, n_queries=n_queries,
        n_papers=n_papers, n_prior_included=n_prior_included,
        n_prior_excluded=n_prior_excluded, query_strategy=query_strategy,
        balance_strategy=balance_strategy,
        feature_extraction=feature_extraction, mode=mode, data_fp=dataset,
        abstract_only=abstract_only)
    cli_settings.from_file(config_file)

    if log_file is not None:
        with open_logger(log_file) as logger:
            if logger.is_empty():
                logger.add_settings(cli_settings)
            settings = logger.settings
    else:
        settings = cli_settings
        logger = None

    if n_queries is not None:
        settings.n_queries = n_queries
    if n_papers is not None:
        settings.n_papers = n_papers
    if model_param is not None:
        settings.model_param = model_param
    if query_param is not None:
        settings.query_param = query_param
    if balance_param is not None:
        settings.balance_param = balance_param
    if feature_param is not None:
        settings.feature_param = feature_param

    # Check if mode is valid.
    if mode in AVAILABLE_REVIEW_CLASSES:
        logging.info(f"Start review in '{mode}' mode.")
    else:
        raise ValueError(f"Unknown mode '{mode}'.")
    logging.debug(settings)

    as_data = ASReviewData.from_file(dataset,
                                     extra_dataset=extra_dataset,
                                     abstract_only=settings.abstract_only)
    texts = as_data.texts
    y = as_data.labels

    # Merge priors found in the data with those given as arguments.
    data_prior_included, data_prior_excluded = as_data.get_priors()
    if len(data_prior_included) != 0:
        if prior_included is None:
            prior_included = []
        prior_included.extend(data_prior_included.tolist())
    if len(data_prior_excluded) != 0:
        if prior_excluded is None:
            prior_excluded = []
        prior_excluded.extend(data_prior_excluded.tolist())

    if as_data.final_labels is not None:
        with open_logger(log_file) as logger:
            logger.set_final_labels(as_data.final_labels)

    train_model = get_model(settings.model, **settings.model_param)
    query_model = get_query_model(settings.query_strategy,
                                  **settings.query_param)
    balance_model = get_balance_model(settings.balance_strategy,
                                      **settings.balance_param)
    feature_model = get_feature_model(settings.feature_extraction,
                                      **settings.feature_param)

    X = feature_model.fit_transform(texts, as_data.title, as_data.abstract)

    if train_model.name.startswith("lstm-"):
        train_model.embedding_matrix = feature_model.get_embedding_matrix(
            texts, embedding_fp)

    # Initialize the review class.
if mode == "simulate": reviewer = ReviewSimulate( X, y, model=train_model, query_model=query_model, balance_model=balance_model, feature_model=feature_model, n_papers=settings.n_papers, n_instances=settings.n_instances, n_queries=settings.n_queries, verbose=verbose, prior_included=prior_included, prior_excluded=prior_excluded, n_prior_included=settings.n_prior_included, n_prior_excluded=settings.n_prior_excluded, log_file=log_file, final_labels=as_data.final_labels, data_fp=dataset, **kwargs) elif mode == "oracle": reviewer = ReviewOracle( X, model=train_model, query_model=query_model, balance_model=balance_model, feature_model=feature_model, as_data=as_data, n_papers=settings.n_papers, n_instances=settings.n_instances, n_queries=settings.n_queries, verbose=verbose, prior_included=prior_included, prior_excluded=prior_excluded, log_file=log_file, data_fp=dataset, **kwargs) elif mode == "minimal": reviewer = MinimalReview( X, model=model, query_model=query_model, balance_model=balance_model, feature_model=feature_model, n_papers=settings.n_papers, n_instances=settings.n_instances, n_queries=settings.n_queries, verbose=verbose, prior_included=prior_included, prior_excluded=prior_excluded, log_file=log_file, data_fp=dataset, **kwargs) else: raise ValueError("Error finding mode, should never come here...") return reviewer
    for text in texts:
        n_numbers.append(sum(c.isdigit() for c in text))
    return np.array(n_numbers, dtype=int)


if __name__ == "__main__":
    data_dir = sys.argv[1]
    data_fp = sys.argv[2]

    analysis = Analysis.from_dir(data_dir)
    ttd = analysis.avg_time_to_discovery()
    ttd_order = sorted(ttd, key=lambda x: ttd[x])

    as_data = ASReviewData.from_file(data_fp)

    # Count papers with a (near-)empty abstract and how many of those were
    # included.
    n_abstract_missing = 0
    n_missing_included = 0
    for i, abstract in enumerate(as_data.abstract):
        if len(abstract) < 10:
            n_abstract_missing += 1
            if as_data.labels[i] == 1:
                n_missing_included += 1

    n_paper = len(as_data.abstract)
    n_included = np.sum(as_data.labels)
    print(f"Number of abstracts missing: {n_abstract_missing}/{n_paper}")
    print(f"Number of included abstracts missing: "
          f"{n_missing_included}/{n_included}")

    for idx in ttd_order[-10:]:
def get_reviewer(dataset,
                 mode='oracle',
                 model=DEFAULT_MODEL,
                 query_strategy=DEFAULT_QUERY_STRATEGY,
                 balance_strategy=DEFAULT_BALANCE_STRATEGY,
                 n_instances=DEFAULT_N_INSTANCES,
                 n_queries=1,
                 embedding_fp=None,
                 verbose=1,
                 prior_included=None,
                 prior_excluded=None,
                 n_prior_included=DEFAULT_N_PRIOR_INCLUDED,
                 n_prior_excluded=DEFAULT_N_PRIOR_EXCLUDED,
                 config_file=None,
                 src_log_fp=None,
                 **kwargs):
    # Find the URL of the dataset if it is one of the example datasets.
    if dataset in DEMO_DATASETS.keys():
        dataset = DEMO_DATASETS[dataset]

    if src_log_fp is not None:
        logger = Logger(log_fp=src_log_fp)
        settings = logger.settings
    else:
        logger = None
        settings = ASReviewSettings(
            model=model, n_instances=n_instances, n_queries=n_queries,
            n_prior_included=n_prior_included,
            n_prior_excluded=n_prior_excluded,
            query_strategy=query_strategy,
            balance_strategy=balance_strategy, mode=mode, data_fp=dataset)
        settings.from_file(config_file)

    model = settings.model

    if model in ["lstm_base", "lstm_pool"]:
        base_model = "RNN"
    else:
        base_model = "other"

    # Check if mode is valid.
    if mode in AVAILABLE_REVIEW_CLASSES:
        if verbose:
            print(f"Start review in '{mode}' mode.")
    else:
        raise ValueError(f"Unknown mode '{mode}'.")
    print(f"Model: '{model}'")

    # If the provided file is a pickle file.
    if is_pickle(dataset):
        with open(dataset, 'rb') as f:
            data_obj = pickle.load(f)
        if isinstance(data_obj, tuple) and len(data_obj) == 3:
            X, y, embedding_matrix = data_obj
        elif isinstance(data_obj, tuple) and len(data_obj) == 4:
            X, y, embedding_matrix, _ = data_obj
        else:
            raise ValueError("Incorrect pickle object.")
    else:
        as_data = ASReviewData.from_file(dataset)
        _, texts, labels = as_data.get_data()

        # Get the model.
        if base_model == "RNN":
            if embedding_fp is None:
                embedding_fp = Path(get_data_home(),
                                    EMBEDDING_EN["name"]).expanduser()
                if not embedding_fp.exists():
                    print("Warning: will start to download large "
                          "embedding file in 10 seconds.")
                    time.sleep(10)
                    download_embedding(verbose=verbose)

            # Create features and labels.
            X, word_index = text_to_features(texts)
            y = labels
            embedding = load_embedding(embedding_fp, word_index=word_index)
            embedding_matrix = sample_embedding(embedding, word_index)
        elif model.lower() in ['nb', 'svc', 'svm']:
            from sklearn.pipeline import Pipeline
            from sklearn.feature_extraction.text import TfidfTransformer
            from sklearn.feature_extraction.text import CountVectorizer

            text_clf = Pipeline([('vect', CountVectorizer()),
                                 ('tfidf', TfidfTransformer())])
            X = text_clf.fit_transform(texts)
            y = labels

    settings.fit_kwargs = {}
    settings.query_kwargs = {}

    if base_model == 'RNN':
        if model == "lstm_base":
            model_kwargs = lstm_base_model_defaults(settings, verbose)
            create_lstm_model = create_lstm_base_model
        elif model == "lstm_pool":
            model_kwargs = lstm_pool_model_defaults(settings, verbose)
            create_lstm_model = create_lstm_pool_model
        else:
            raise ValueError(f"Unknown model {model}")

        settings.fit_kwargs = lstm_fit_defaults(settings, verbose)
        settings.query_kwargs['verbose'] = verbose
        # Create the model.
        model = KerasClassifier(create_lstm_model(
            embedding_matrix=embedding_matrix, **model_kwargs),
            verbose=verbose)
    elif model.lower() in ['nb']:
        from asreview.models import create_nb_model
        model = create_nb_model()
    elif model.lower() in ['svm', 'svc']:
        from asreview.models import create_svc_model
        model = create_svc_model()
    else:
        raise ValueError('Model not found.')

    # Pick the query strategy.
    query_fn, query_str = get_query_strategy(settings)
    if verbose:
        print(f"Query strategy: {query_str}")

    train_data_fn, train_method = get_balance_strategy(settings)
    if verbose:
        print(f"Using {train_method} method to obtain training data.")

    # Initialize the review class.
    if mode == "simulate":
        reviewer = ReviewSimulate(
            X, y,
            model=model,
            query_strategy=query_fn,
            train_data_fn=train_data_fn,
            n_instances=settings.n_instances,
            n_queries=settings.n_queries,
            verbose=verbose,
            prior_included=prior_included,
            prior_excluded=prior_excluded,
            n_prior_included=settings.n_prior_included,
            n_prior_excluded=settings.n_prior_excluded,
            fit_kwargs=settings.fit_kwargs,
            balance_kwargs=settings.balance_kwargs,
            query_kwargs=settings.query_kwargs,
            logger=logger,
            **kwargs)
    elif mode == "oracle":
        reviewer = ReviewOracle(
            X,
            model=model,
            query_strategy=query_fn,
            as_data=as_data,
            train_data_fn=train_data_fn,
            n_instances=settings.n_instances,
            n_queries=settings.n_queries,
            verbose=verbose,
            prior_included=prior_included,
            prior_excluded=prior_excluded,
            fit_kwargs=settings.fit_kwargs,
            balance_kwargs=settings.balance_kwargs,
            query_kwargs=settings.query_kwargs,
            logger=logger,
            **kwargs)
    elif mode == "minimal":
        reviewer = MinimalReview(
            X,
            model=model,
            query_strategy=query_fn,
            train_data_fn=train_data_fn,
            n_instances=settings.n_instances,
            n_queries=settings.n_queries,
            verbose=verbose,
            prior_included=prior_included,
            prior_excluded=prior_excluded,
            fit_kwargs=settings.fit_kwargs,
            balance_kwargs=settings.balance_kwargs,
            query_kwargs=settings.query_kwargs,
            logger=logger,
            **kwargs)
    else:
        raise ValueError("Error finding mode, should never come here...")

    reviewer._logger.add_settings(settings)
    return reviewer

def test_csv_write_data():
    fp_in = Path("test", "demo_data", "generic_labels.csv")
    fp_out = Path("test", "out_data", "generic_out.csv")
    asr_data = ASReviewData.from_file(fp_in)
    asr_data.to_csv(fp_out, labels=[0, 1, 0, 1, 0, 1])
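# A hedged round-trip sketch: write the labels and read them back. Whether
# from_file() picks up the label column that to_csv() writes is an assumption
# about the on-disk format, as is the output file name used here.
def test_csv_label_roundtrip():
    fp_in = Path("test", "demo_data", "generic_labels.csv")
    fp_out = Path("test", "out_data", "generic_roundtrip.csv")
    labels = [0, 1, 0, 1, 0, 1]
    ASReviewData.from_file(fp_in).to_csv(fp_out, labels=labels)
    assert list(ASReviewData.from_file(fp_out).labels) == labels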