def check_lstm(monkeypatch, use_granular=False, **kwargs):
    monkeypatch.setattr('builtins.input', lambda _: "0")
    # start the review process.
    reviewer = get_reviewer(data_fp,
                            mode="oracle",
                            embedding_fp=embedding_fp,
                            prior_included=[1, 3],
                            prior_excluded=[2, 4],
                            **kwargs)
    if use_granular:
        # Two loops of training and classification.
        reviewer.train()
        reviewer.log_probabilities()
        query_idx = reviewer.query(1)
        inclusions = reviewer._get_labels(query_idx)
        reviewer.classify(query_idx, inclusions)

        reviewer.train()
        reviewer.log_probabilities()
        query_idx = reviewer.query(1)
        inclusions = reviewer._get_labels(query_idx)
        reviewer.classify(query_idx, inclusions)
    else:
        reviewer.review()
    check_log(reviewer._logger._log_dict)

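# --- Hypothetical usage (not from the original source) ---
# A helper like check_lstm is typically driven from thin pytest wrappers.
# "lstm_base" is a real ASReview model name, but these wrapper tests and
# their keyword arguments are illustrative assumptions.
def test_lstm_base(monkeypatch):
    check_lstm(monkeypatch, model="lstm_base", n_instances=1, n_queries=2)


def test_lstm_granular(monkeypatch):
    check_lstm(monkeypatch, use_granular=True, model="lstm_base",
               n_instances=1, n_queries=2)
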
def check_model(monkeypatch=None, use_granular=False,
                state_file=h5_state_file, continue_from_state=False,
                mode="simulate", data_fp=data_fp, state_checker=check_state,
                prior_idx=[1, 2, 3, 4], **kwargs):
    if not continue_from_state:
        try:
            if state_file is not None:
                os.unlink(state_file)
        except OSError:
            pass

    if monkeypatch is not None:
        monkeypatch.setattr('builtins.input', lambda _: "0")

    # start the review process.
    reviewer = get_reviewer(data_fp,
                            mode=mode,
                            embedding_fp=embedding_fp,
                            prior_idx=prior_idx,
                            state_file=state_file,
                            **kwargs)
    if use_granular:
        with open_state(state_file) as state:
            # Two loops of training and classification.
            reviewer.train()
            reviewer.log_probabilities(state)
            query_idx = reviewer.query(1)
            inclusions = reviewer._get_labels(query_idx)
            reviewer.classify(query_idx, inclusions, state)

            reviewer.train()
            reviewer.log_probabilities(state)
            query_idx = reviewer.query(1)
            inclusions = reviewer._get_labels(query_idx)
            reviewer.classify(query_idx, inclusions, state)
    else:
        with open_state(state_file) as state:
            if state_file is None:
                state.set_labels(reviewer.y)
            init_idx, init_labels = reviewer._prior_knowledge()
            reviewer.query_i = 0
            # np.int is deprecated; use the builtin int as the dtype.
            reviewer.train_idx = np.array([], dtype=int)
            reviewer.classify(init_idx, init_labels, state, method="initial")

            reviewer._do_review(state)
            if state_file is None:
                print(state._state_dict)
                check_state(state)

    if state_file is not None:
        with open_state(state_file, read_only=True) as state:
            state_checker(state)

def check_model(monkeypatch=None, use_granular=False, log_file=h5_log_file,
                continue_from_log=False, mode="oracle", **kwargs):
    if not continue_from_log:
        try:
            if log_file is not None:
                os.unlink(log_file)
        except OSError:
            pass

    if monkeypatch is not None:
        monkeypatch.setattr('builtins.input', lambda _: "0")

    # start the review process.
    reviewer = get_reviewer(data_fp,
                            mode=mode,
                            embedding_fp=embedding_fp,
                            prior_included=[1, 3],
                            prior_excluded=[2, 4],
                            log_file=log_file,
                            **kwargs)
    if use_granular:
        with open_logger(log_file) as logger:
            # Two loops of training and classification.
            reviewer.train()
            reviewer.log_probabilities(logger)
            query_idx = reviewer.query(1)
            inclusions = reviewer._get_labels(query_idx)
            reviewer.classify(query_idx, inclusions, logger)

            reviewer.train()
            reviewer.log_probabilities(logger)
            query_idx = reviewer.query(1)
            inclusions = reviewer._get_labels(query_idx)
            reviewer.classify(query_idx, inclusions, logger)
    else:
        with open_logger(log_file) as logger:
            if log_file is None:
                logger.set_labels(reviewer.y)
            init_idx, init_labels = reviewer._prior_knowledge()
            reviewer.query_i = 0
            # np.int is deprecated; use the builtin int as the dtype.
            reviewer.train_idx = np.array([], dtype=int)
            reviewer.classify(init_idx, init_labels, logger,
                              method="initial")

            reviewer._do_review(logger)
            if log_file is None:
                print(logger._log_dict)
                check_log(logger)

    if log_file is not None:
        with open_logger(log_file, read_only=True) as logger:
            check_log(logger)

def test_state_continue_h5():
    inter_file = os.path.join(state_dir, "test_1_inst.h5")
    if not os.path.isfile(inter_file):
        reviewer = get_reviewer(data_fp,
                                mode="simulate",
                                model="nb",
                                embedding_fp=embedding_fp,
                                prior_idx=[1, 2, 3, 4],
                                state_file=inter_file,
                                n_instances=1,
                                n_queries=1)
        reviewer.review()

    copyfile(inter_file, h5_state_file)
    check_model(mode="simulate",
                model="nb",
                state_file=h5_state_file,
                continue_from_state=True,
                n_instances=1,
                n_queries=2)

def test_model_seed():
    n_test = 4
    seed = 192874123
    last_train_idx = None
    for _ in range(n_test):
        reviewer = get_reviewer(data_fp,
                                mode="simulate",
                                model="rf",
                                query_strategy="random",
                                state_file=None,
                                init_seed=seed,
                                seed=seed,
                                n_prior_excluded=1,
                                n_prior_included=1)
        reviewer.review()
        if last_train_idx is None:
            last_train_idx = reviewer.train_idx
        assert np.all(last_train_idx == reviewer.train_idx)

def test_no_seed():
    n_test_max = 100
    as_data = ASReviewData.from_file(data_fp)
    n_priored = np.zeros(len(as_data), dtype=int)

    for _ in range(n_test_max):
        reviewer = get_reviewer(data_fp,
                                mode="simulate",
                                model="nb",
                                state_file=None,
                                init_seed=None,
                                n_prior_excluded=1,
                                n_prior_included=1)
        assert len(reviewer.start_idx) == 2
        n_priored[reviewer.start_idx] += 1
        if np.all(n_priored > 0):
            return
    raise ValueError(f"Error getting all priors in {n_test_max} iterations.")

def test_state_continue_json():
    inter_file = Path(state_dir, "test_1_inst.json")
    if not inter_file.is_file():
        reviewer = get_reviewer(data_fp,
                                mode="simulate",
                                model="nb",
                                embedding_fp=embedding_fp,
                                prior_idx=[1, 2, 3, 4],
                                state_file=inter_file,
                                n_instances=1,
                                n_queries=1)
        reviewer.review()

    copyfile(inter_file, json_state_file)
    check_model(model="nb",
                state_file=json_state_file,
                continue_from_state=True,
                n_instances=1,
                n_queries=2)

def test_log_continue_h5():
    inter_file = os.path.join(log_dir, "test_1_inst.h5")
    if not os.path.isfile(inter_file):
        reviewer = get_reviewer(data_fp,
                                mode="simulate",
                                model="nb",
                                embedding_fp=embedding_fp,
                                prior_included=[1, 3],
                                prior_excluded=[2, 4],
                                log_file=inter_file,
                                n_instances=1,
                                n_queries=1)
        reviewer.review()

    copyfile(inter_file, h5_log_file)
    check_model(mode="simulate",
                model="nb",
                log_file=h5_log_file,
                continue_from_log=True,
                n_instances=1,
                n_queries=2)

def test_init_seed():
    base_start_idx = None
    n_test = 4
    seeds = np.random.randint(0, 2**63, 5)
    for _ in range(n_test):
        all_start_idx = []
        for seed in seeds:
            reviewer = get_reviewer(data_fp,
                                    mode="simulate",
                                    model="nb",
                                    state_file=None,
                                    init_seed=seed,
                                    n_prior_excluded=1,
                                    n_prior_included=1)
            assert len(reviewer.start_idx) == 2
            all_start_idx.append(reviewer.start_idx)
        if base_start_idx is None:
            base_start_idx = all_start_idx
            continue
        assert np.all(np.array(base_start_idx) == np.array(all_start_idx))

def check_lstm(use_granular=False, **kwargs):
    # start the review process.
    reviewer = get_reviewer(data_fp,
                            mode="simulate",
                            embedding_fp=embedding_fp,
                            prior_included=[1, 3],
                            prior_excluded=[2, 4],
                            **kwargs)
    if use_granular:
        # Two loops of training and classification.
        reviewer.train()
        reviewer.log_probabilities()
        query_idx = reviewer.query(1)
        inclusions = reviewer._get_labels(query_idx)
        reviewer.classify(query_idx, inclusions)

        reviewer.train()
        reviewer.log_probabilities()
        query_idx = reviewer.query(1)
        inclusions = reviewer._get_labels(query_idx)
        reviewer.classify(query_idx, inclusions)
    else:
        reviewer.review()
    check_log(reviewer._logger._log_dict)

def execute(self, param, data_name, i_run):
    split_param = get_split_param(param)
    state_file = get_state_file_name(self.trials_dir, data_name, i_run)
    try:
        os.remove(state_file)
    except FileNotFoundError:
        pass
    start_idx = self.get_cached_priors(data_name, i_run)

    reviewer = get_reviewer(data_fp_from_name(self.data_dir, data_name),
                            mode='simulate',
                            model=self.model_name,
                            query_strategy=self.query_name,
                            balance_strategy=self.balance_name,
                            feature_extraction=self.feature_name,
                            n_instances=self.n_instances,
                            n_papers=self.n_papers,
                            state_file=state_file,
                            prior_idx=start_idx,
                            **split_param)
    reviewer.review()

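# --- Hypothetical driver (not from the original source) ---
# The execute() method above looks like part of a trials runner; a driver
# loop over datasets and repeated runs might look like this. The runner
# object and argument names are assumptions for illustration.
def run_all_trials(runner, param, data_names, n_runs):
    for data_name in data_names:
        for i_run in range(n_runs):
            # Each (dataset, run) pair gets its own state file and cached
            # prior indices inside execute().
            runner.execute(param, data_name, i_run)
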
def test_state_continue_h5(tmpdir):
    inter_file = Path(STATE_DIR, "test_1_inst.h5")
    if not inter_file.is_file():
        reviewer = get_reviewer(DATA_FP,
                                mode="simulate",
                                model="nb",
                                embedding_fp=EMBEDDING_FP,
                                prior_idx=[1, 2, 3, 4],
                                state_file=inter_file,
                                n_instances=1,
                                n_queries=1)
        reviewer.review()

    # copy state file to tmp dir for changes
    tmp_h5_state_fp = Path(tmpdir, "tmp_state.h5")
    copyfile(inter_file, tmp_h5_state_fp)

    check_model(model="nb",
                state_file=tmp_h5_state_fp,
                continue_from_state=True,
                n_instances=1,
                n_queries=2)

def train_model(project_id, label_method=None):
    """Add the new labels to the review and do the modeling.

    It uses a lock to ensure only one model is running at the same time.
    Old results directories are deleted after 4 iterations.

    It has one argument on the CLI, which is the base project directory.
    """

    logging.info(f"Project {project_id} - Train a new model for project")

    # get file locations
    asr_kwargs_file = get_kwargs_path(project_id)
    lock_file = get_lock_path(project_id)

    # Lock so that only one training run is running at the same time.
    # It doesn't lock the flask server/client.
    with SQLiteLock(lock_file, blocking=False, lock_name="training",
                    project_id=project_id) as lock:

        # If the lock is not acquired, another training instance is running.
        if not lock.locked():
            logging.info(f"Project {project_id} - "
                         "Cannot acquire lock, other instance running.")
            return

        # Lock the current state. We want to have a consistent active state.
        # This does communicate with the flask backend; it prevents writing
        # and reading to the same files at the same time.
        with SQLiteLock(lock_file, blocking=True, lock_name="active",
                        project_id=project_id) as lock:
            # Get all the labels since the last run. If there are no new
            # labels, quit.
            new_label_history = read_label_history(project_id)

        data_fp = str(get_data_file_path(project_id))
        as_data = read_data(project_id)
        state_file = get_state_path(project_id)

        # collect command line arguments and pass them to the reviewer
        with open(asr_kwargs_file, "r") as fp:
            asr_kwargs = json.load(fp)
        asr_kwargs['state_file'] = str(state_file)

        reviewer = get_reviewer(dataset=data_fp, mode="minimal", **asr_kwargs)

        with open_state(state_file) as state:
            old_label_history = get_label_train_history(state)

        diff_history = get_diff_history(new_label_history, old_label_history)

        if len(diff_history) == 0:
            logging.info(
                f"Project {project_id} - No new labels since last run.")
            return

        query_idx = np.array([x[0] for x in diff_history], dtype=int)
        inclusions = np.array([x[1] for x in diff_history], dtype=int)

        # Classify the new labels, train and store the results.
        with open_state(state_file) as state:
            reviewer.classify(query_idx, inclusions, state,
                              method=label_method)
            reviewer.train()
            reviewer.log_probabilities(state)
            new_query_idx = reviewer.query(reviewer.n_pool()).tolist()
            reviewer.log_current_query(state)
            proba = state.pred_proba.tolist()

        # Update the pool under the active lock so readers see a
        # consistent state.
        with SQLiteLock(lock_file, blocking=True, lock_name="active",
                        project_id=project_id) as lock:
            current_pool = read_pool(project_id)
            in_current_pool = np.zeros(len(as_data))
            in_current_pool[current_pool] = 1
            new_pool = [x for x in new_query_idx if in_current_pool[x]]
            write_pool(project_id, new_pool)
            write_proba(project_id, proba)

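# --- Assumed sketch of the diff-history helper (not the actual source) ---
# get_diff_history() used above is not shown; a minimal version consistent
# with its use would return the (index, label) pairs added since the last
# training run.
def get_diff_history(new_history, old_history):
    """Return the tail of new_history that deviates from old_history."""
    for i in range(len(new_history)):
        if i >= len(old_history) or old_history[i] != new_history[i]:
            # Everything from the first mismatch onward counts as new.
            return new_history[i:]
    # No new labels since the last run.
    return []
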
def test_dataset_not_found():
    # Reviewing a dataset that does not exist should fail with
    # FileNotFoundError.
    with pytest.raises(FileNotFoundError):
        reviewer = get_reviewer("doesnt_exist.csv", mode="simulate")
        reviewer.review()

def test_dataset_from_benchmark_group():
    reviewer = get_reviewer("benchmark:Cohen_2006_ACEInhibitors",
                            mode="simulate")
    reviewer.review()

def test_dataset_from_url():
    reviewer = get_reviewer(DATA_FP_URL, mode="simulate")
    reviewer.review()

def train_model(project_id, label_method=None):
    """Add the new labels to the review and do the modeling.

    It uses a lock to ensure only one model is running at the same time.
    Old results directories are deleted after 4 iterations.

    It has one argument on the CLI, which is the base project directory.
    """

    logging.info(f"Project {project_id} - Train a new model for project")

    # get file locations
    asr_kwargs_file = get_kwargs_path(project_id)
    lock_file = get_lock_path(project_id)

    # Lock so that only one training run is running at the same time.
    # It doesn't lock the flask server/client.
    with SQLiteLock(
            lock_file,
            blocking=False,
            lock_name="training",
            project_id=project_id) as lock:

        # If the lock is not acquired, another training instance is running.
        if not lock.locked():
            logging.info(f"Project {project_id} - "
                         "Cannot acquire lock, other instance running.")
            return

        # Lock the current state. We want to have a consistent active state.
        # This does communicate with the flask backend; it prevents writing
        # and reading to the same files at the same time.
        with SQLiteLock(
                lock_file,
                blocking=True,
                lock_name="active",
                project_id=project_id) as lock:
            # Get all the labels since the last run. If there are no new
            # labels, quit.
            new_label_history = read_label_history(project_id)

        data_fp = str(get_data_file_path(project_id))
        as_data = read_data(project_id)
        state_file = get_state_path(project_id)

        # collect command line arguments and pass them to the reviewer
        with open(asr_kwargs_file, "r") as fp:
            asr_kwargs = json.load(fp)

        # remove "abstract_only" if present; it is not a reviewer argument
        try:
            del asr_kwargs["abstract_only"]
        except KeyError:
            pass

        asr_kwargs['state_file'] = str(state_file)
        reviewer = get_reviewer(dataset=data_fp, mode="minimal", **asr_kwargs)

        with open_state(state_file) as state:
            old_label_history = _get_label_train_history(state)

        diff_history = _get_diff_history(new_label_history,
                                         old_label_history)

        if len(diff_history) == 0:
            logging.info(
                f"Project {project_id} - No new labels since last run.")
            return

        query_record_ids = np.array([x[0] for x in diff_history], dtype=int)
        inclusions = np.array([x[1] for x in diff_history], dtype=int)
        query_idx = convert_id_to_idx(as_data, query_record_ids)

        # Classify the new labels, train and store the results.
        with open_state(state_file) as state:
            reviewer.classify(
                query_idx, inclusions, state, method=label_method)
            reviewer.train()
            reviewer.log_probabilities(state)
            new_query_idx = reviewer.query(reviewer.n_pool()).tolist()
            reviewer.log_current_query(state)

            # write the probabilities to a pandas DataFrame with the
            # record_ids as index
            proba = pd.DataFrame(
                {"proba": state.pred_proba.tolist()},
                index=pd.Index(as_data.record_ids, name="record_id")
            )

        # Update the pool and write out the probabilities.
        # Important: the pool is sorted on query order.
        with SQLiteLock(
                lock_file,
                blocking=True,
                lock_name="active",
                project_id=project_id) as lock:

            # read the pool
            current_pool = read_pool(project_id)

            # diff the pool and new_query_idx
            current_pool_idx = convert_id_to_idx(as_data, current_pool)
            current_pool_idx = frozenset(current_pool_idx)
            new_pool_idx = [x for x in new_query_idx
                            if x in current_pool_idx]

            # convert new_pool_idx back to record_ids
            new_pool = convert_idx_to_id(as_data, new_pool_idx)

            # write the pool and probabilities
            write_pool(project_id, new_pool)
            write_proba(project_id, proba)

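# --- Assumed sketch of the id/index conversions (not the actual source) ---
# convert_id_to_idx() and convert_idx_to_id() used above are not shown;
# versions consistent with their use here, assuming as_data.record_ids is
# an array of unique record identifiers, could look like this.
def convert_id_to_idx(as_data, record_ids):
    """Map record identifiers to positional indices in the dataset."""
    id_to_idx = {rid: i for i, rid in enumerate(as_data.record_ids)}
    return np.array([id_to_idx[rid] for rid in record_ids], dtype=int)


def convert_idx_to_id(as_data, idx_list):
    """Map positional indices back to record identifiers."""
    record_ids = np.asarray(as_data.record_ids)
    return record_ids[np.asarray(idx_list, dtype=int)].tolist()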