def test_query(query_strategy,
               n_features=50,
               n_sample=100,
               n_instances_list=(0, 1, 5, 50),
               n_train_idx=(0, 1, 5, 50)):
    """Check a query strategy on random data for several pool/batch sizes.

    The "rf" model is fitted on a random feature matrix with shuffled 0/1
    labels. For every combination of `n_instances_list` and `n_train_idx`
    the query model is asked for samples from the pool, and the result is
    handed to `check_integrity`.

    Arguments
    ---------
    query_strategy: str
        Name of the query strategy; also split on '_' to obtain the
        sources passed to `check_integrity`.
    n_features: int
        Number of columns of the random feature matrix (and, for the
        "cluster" strategy, the number of texts handed to the query model).
    n_sample: int
        Number of rows of the random feature matrix.
    n_instances_list: sequence of int
        Numbers of instances to query.
    n_train_idx: sequence of int
        Numbers of samples removed from the pool as training indices.
    """
    classifier = get_model("rf")
    if query_strategy == "cluster":
        data_fp = os.path.join("test", "demo_data", "generic.csv")
        texts = ASReviewData.from_file(data_fp).texts
        if n_features > 0 and len(texts) == 0:
            # Doubling an empty collection never grows it; fail loudly
            # instead of spinning forever in the loop below.
            raise ValueError(f"No texts found in '{data_fp}'.")
        # Repeat the demo texts until there are at least n_features of them.
        while len(texts) < n_features:
            texts = np.append(texts, texts)
        texts = texts[:n_features]
        query_model = get_query_model(query_strategy,
                                      texts=texts,
                                      update_interval=None,
                                      cluster_size=int(n_sample / 3))
    else:
        query_model = get_query_model(query_strategy)

    X = np.random.rand(n_sample, n_features)

    # Half zeros, the remainder ones; computing the second half as a
    # difference keeps len(y) == n_sample when n_sample is odd.
    n_zeros = n_sample // 2
    y = np.concatenate((np.zeros(n_zeros), np.ones(n_sample - n_zeros)),
                       axis=0)
    order = np.random.permutation(n_sample)
    X = X[order]
    y = y[order]
    sources = query_strategy.split('_')

    classifier.fit(X, y)

    assert isinstance(query_model.param, dict)
    assert query_model.name == query_strategy

    for n_instances in n_instances_list:
        for n_train in n_train_idx:
            shared = {"query_src": {}, "current_queries": {}}
            train_idx = np.random.choice(np.arange(n_sample),
                                         n_train,
                                         replace=False)
            # Everything that is not a training index forms the pool.
            pool_idx = np.delete(np.arange(n_sample), train_idx)
            query_idx, X_query = query_model.query(X, classifier, pool_idx,
                                                   n_instances, shared)
            check_integrity(query_idx, X_query, X, pool_idx, shared,
                            n_instances, sources)
Пример #2
0
def test_features(feature_extraction, split_ta):
    """Fit a feature extraction model on the generic demo data.

    Models whose name starts with "embedding-" additionally receive the
    path of the demo embedding file. The resulting matrix must have one
    row per title and at least one column, and the model must expose a
    dict `param` and a `name` equal to `feature_extraction`.
    """
    demo_dir = os.path.join("test", "demo_data")

    as_data = ASReviewData.from_file(os.path.join(demo_dir, "generic.csv"))
    texts = as_data.texts

    # Only embedding based models take an embedding file.
    model_kwargs = {"split_ta": split_ta}
    if feature_extraction.startswith("embedding-"):
        model_kwargs["embedding_fp"] = os.path.join(demo_dir, "generic.vec")
    model = get_feature_model(feature_extraction, **model_kwargs)

    X = model.fit_transform(texts,
                            titles=as_data.title,
                            abstracts=as_data.abstract)

    assert X.shape[0] == len(as_data.title)
    assert X.shape[1] > 0
    assert isinstance(model.param, dict)
    assert model.name == feature_extraction
Пример #3
0
def test_csv_write_data():
    """Read the labelled CSV example and write it back out with labels."""
    export_labels = [0, 1, 0, 1, 0, 1]
    source_fp = Path("test", "demo_data", "csv_example_with_labels.csv")
    target_fp = Path("test", "out_data", "csv_out_example.csv")
    ASReviewData.from_file(source_fp).to_csv(target_fp, labels=export_labels)
Пример #4
0
def get_reviewer(dataset,
                 mode='oracle',
                 model=DEFAULT_MODEL,
                 query_strategy=DEFAULT_QUERY_STRATEGY,
                 balance_strategy=DEFAULT_BALANCE_STRATEGY,
                 n_instances=DEFAULT_N_INSTANCES,
                 n_papers=None,
                 n_queries=None,
                 embedding_fp=None,
                 verbose=0,
                 prior_included=None,
                 prior_excluded=None,
                 n_prior_included=DEFAULT_N_PRIOR_INCLUDED,
                 n_prior_excluded=DEFAULT_N_PRIOR_EXCLUDED,
                 config_file=None,
                 log_file=None,
                 model_param=None,
                 query_param=None,
                 balance_param=None,
                 abstract_only=False,
                 **kwargs):
    """ Build the review object matching `mode` from the given arguments.

        The arguments themselves are described in __main__.py.
        Raises a ValueError when `mode` is not an available review class.
    """

    # A demo dataset name is replaced by the entry stored for it.
    if dataset in DEMO_DATASETS.keys():
        dataset = DEMO_DATASETS[dataset]

    cli_settings = ASReviewSettings(
        model=model, n_instances=n_instances, n_queries=n_queries,
        n_papers=n_papers, n_prior_included=n_prior_included,
        n_prior_excluded=n_prior_excluded, query_strategy=query_strategy,
        balance_strategy=balance_strategy, mode=mode, data_fp=dataset,
        abstract_only=abstract_only)
    cli_settings.from_file(config_file)

    # With a log file, the settings stored in it win; an empty log is first
    # seeded with the command line settings.
    if log_file is None:
        settings = cli_settings
    else:
        with open_logger(log_file) as log:
            if log.is_empty():
                log.add_settings(cli_settings)
            settings = log.settings

    # Explicitly passed arguments override whatever the settings hold.
    explicit_overrides = (
        ("n_queries", n_queries),
        ("n_papers", n_papers),
        ("model_param", model_param),
        ("query_param", query_param),
        ("balance_param", balance_param),
    )
    for attr_name, attr_value in explicit_overrides:
        if attr_value is not None:
            setattr(settings, attr_name, attr_value)

    model_name = settings.model

    # Check if mode is valid
    if mode not in AVAILABLE_REVIEW_CLASSES:
        raise ValueError(f"Unknown mode '{mode}'.")
    logging.info(f"Start review in '{mode}' mode.")
    logging.debug(settings)

    as_data = ASReviewData.from_file(dataset,
                                     abstract_only=settings.abstract_only)
    _, texts, labels = as_data.get_data()

    if as_data.final_labels is not None:
        with open_logger(log_file) as log:
            log.set_final_labels(as_data.final_labels)

    model_inst = get_model_class(model_name)(param=settings.model_param,
                                             embedding_fp=embedding_fp)
    X, y = model_inst.get_Xy(texts, labels)

    model_fn = model_inst.model()
    settings.fit_kwargs = model_inst.fit_kwargs()
    settings.query_kwargs = {}

    # Pick query strategy
    query_fn, query_str = get_query_with_settings(settings)
    logging.info(f"Query strategy: {query_str}")

    train_data_fn, train_method = get_balance_with_settings(settings)
    logging.info(f"Using {train_method} method to obtain training data.")

    if mode not in ("simulate", "oracle", "minimal"):
        raise ValueError("Error finding mode, should never come here...")

    # Keyword arguments that every review class receives.
    shared_args = dict(
        model=model_fn,
        query_strategy=query_fn,
        train_data_fn=train_data_fn,
        n_papers=settings.n_papers,
        n_instances=settings.n_instances,
        n_queries=settings.n_queries,
        verbose=verbose,
        prior_included=prior_included,
        prior_excluded=prior_excluded,
        fit_kwargs=settings.fit_kwargs,
        balance_kwargs=settings.balance_kwargs,
        query_kwargs=settings.query_kwargs,
        log_file=log_file,
    )

    # Initialize the review class.
    if mode == "simulate":
        return ReviewSimulate(X,
                              y,
                              n_prior_included=settings.n_prior_included,
                              n_prior_excluded=settings.n_prior_excluded,
                              final_labels=as_data.final_labels,
                              **shared_args,
                              **kwargs)
    if mode == "oracle":
        return ReviewOracle(X, as_data=as_data, **shared_args, **kwargs)
    return MinimalReview(X, **shared_args, **kwargs)
def get_reviewer(dataset,
                 mode='oracle',
                 model=DEFAULT_MODEL,
                 query_strategy=DEFAULT_QUERY_STRATEGY,
                 balance_strategy=DEFAULT_BALANCE_STRATEGY,
                 feature_extraction=DEFAULT_FEATURE_EXTRACTION,
                 n_instances=DEFAULT_N_INSTANCES,
                 n_papers=None,
                 n_queries=None,
                 embedding_fp=None,
                 verbose=0,
                 prior_included=None,
                 prior_excluded=None,
                 n_prior_included=DEFAULT_N_PRIOR_INCLUDED,
                 n_prior_excluded=DEFAULT_N_PRIOR_EXCLUDED,
                 config_file=None,
                 log_file=None,
                 model_param=None,
                 query_param=None,
                 balance_param=None,
                 feature_param=None,
                 abstract_only=False,
                 extra_dataset=None,
                 **kwargs
                 ):
    """ Get a review object from arguments. See __main__.py for a description
        of the arguments.

        The settings are taken from the log file when one is given (an empty
        log is first filled with the command line settings); explicitly
        passed n_queries, n_papers and *_param arguments override them.
        Priors stored in the data file are appended to copies of
        `prior_included` / `prior_excluded`; the caller's lists are not
        modified. `extra_dataset` defaults to an empty list.

        Raises a ValueError if `mode` is not an available review class.
    """
    # None stands in for "no extra datasets" to avoid a shared mutable default.
    if extra_dataset is None:
        extra_dataset = []

    # Find the URL of the datasets if the dataset is an example dataset.
    if dataset in DEMO_DATASETS:
        dataset = DEMO_DATASETS[dataset]

    cli_settings = ASReviewSettings(
        model=model, n_instances=n_instances, n_queries=n_queries,
        n_papers=n_papers, n_prior_included=n_prior_included,
        n_prior_excluded=n_prior_excluded, query_strategy=query_strategy,
        balance_strategy=balance_strategy,
        feature_extraction=feature_extraction,
        mode=mode, data_fp=dataset,
        abstract_only=abstract_only)
    cli_settings.from_file(config_file)

    if log_file is not None:
        with open_logger(log_file) as logger:
            if logger.is_empty():
                logger.add_settings(cli_settings)
            settings = logger.settings
    else:
        settings = cli_settings

    if n_queries is not None:
        settings.n_queries = n_queries
    if n_papers is not None:
        settings.n_papers = n_papers

    if model_param is not None:
        settings.model_param = model_param
    if query_param is not None:
        settings.query_param = query_param
    if balance_param is not None:
        settings.balance_param = balance_param
    if feature_param is not None:
        settings.feature_param = feature_param

    # Check if mode is valid
    if mode in AVAILABLE_REVIEW_CLASSES:
        logging.info(f"Start review in '{mode}' mode.")
    else:
        raise ValueError(f"Unknown mode '{mode}'.")
    logging.debug(settings)

    as_data = ASReviewData.from_file(dataset, extra_dataset=extra_dataset,
                                     abstract_only=settings.abstract_only)
    texts = as_data.texts
    y = as_data.labels

    # Merge priors found in the data with the ones passed in. Work on copies
    # so that the caller's lists are never extended in place.
    data_prior_included, data_prior_excluded = as_data.get_priors()
    if len(data_prior_included) != 0:
        prior_included = ([] if prior_included is None
                          else list(prior_included))
        prior_included.extend(data_prior_included.tolist())
    if len(data_prior_excluded) != 0:
        prior_excluded = ([] if prior_excluded is None
                          else list(prior_excluded))
        prior_excluded.extend(data_prior_excluded.tolist())

    if as_data.final_labels is not None:
        with open_logger(log_file) as logger:
            logger.set_final_labels(as_data.final_labels)

    train_model = get_model(settings.model, **settings.model_param)
    query_model = get_query_model(settings.query_strategy,
                                  **settings.query_param)
    balance_model = get_balance_model(settings.balance_strategy,
                                      **settings.balance_param)
    feature_model = get_feature_model(settings.feature_extraction,
                                      **settings.feature_param)

    X = feature_model.fit_transform(texts, as_data.title, as_data.abstract)

    if train_model.name.startswith("lstm-"):
        train_model.embedding_matrix = feature_model.get_embedding_matrix(
            texts, embedding_fp)

    # Keyword arguments shared by all review classes. Building them once
    # guarantees every mode receives the instantiated model rather than the
    # raw `model` argument.
    common_args = dict(
        model=train_model,
        query_model=query_model,
        balance_model=balance_model,
        feature_model=feature_model,
        n_papers=settings.n_papers,
        n_instances=settings.n_instances,
        n_queries=settings.n_queries,
        verbose=verbose,
        prior_included=prior_included,
        prior_excluded=prior_excluded,
        log_file=log_file,
        data_fp=dataset,
    )

    # Initialize the review class.
    if mode == "simulate":
        reviewer = ReviewSimulate(
            X, y,
            n_prior_included=settings.n_prior_included,
            n_prior_excluded=settings.n_prior_excluded,
            final_labels=as_data.final_labels,
            **common_args,
            **kwargs)
    elif mode == "oracle":
        reviewer = ReviewOracle(
            X,
            as_data=as_data,
            **common_args,
            **kwargs)
    elif mode == "minimal":
        reviewer = MinimalReview(
            X,
            **common_args,
            **kwargs)
    else:
        raise ValueError("Error finding mode, should never come here...")

    return reviewer
Пример #6
0
    for text in texts:
        n_numbers.append(sum(c.isdigit() for c in text))
    return np.array(n_numbers, dtype=int)


# Script entry point: report how many abstracts are (nearly) empty in a data
# file, next to an analysis loaded from a directory.
#   argv[1]: directory handed to Analysis.from_dir
#   argv[2]: data file handed to ASReviewData.from_file
if __name__ == "__main__":
    data_dir = sys.argv[1]
    data_fp = sys.argv[2]

    analysis = Analysis.from_dir(data_dir)
    ttd = analysis.avg_time_to_discovery()
    # Keys of ttd, ordered by ascending ttd value.
    # NOTE(review): presumably keys are paper indices and values are average
    # times to discovery - confirm against Analysis.
    ttd_order = sorted(ttd, key=lambda x: ttd[x])
    # Disabled debug output of the full ordering:
    #     for idx in ttd_order:
    #         print(f"{idx}: {ttd[idx]}")

    as_data = ASReviewData.from_file(data_fp)
    n_abstract_missing = 0
    n_missing_included = 0
    for i, abstract in enumerate(as_data.abstract):
        # An abstract shorter than 10 characters is counted as missing.
        if len(abstract) < 10:
            n_abstract_missing += 1
            # Label 1 is what the report below calls "included".
            if as_data.labels[i] == 1:
                n_missing_included += 1

    n_paper = len(as_data.abstract)
    # Sum of the labels; equals the number of inclusions only if labels are
    # strictly 0/1 - TODO confirm.
    n_included = np.sum(as_data.labels)
    print(f"Number of abstracts missing: {n_abstract_missing}/{n_paper}")
    print(
        f"Number of included abstracts missing: {n_missing_included}/{n_included}"
    )
    for idx in ttd_order[-10:]:
def get_reviewer(dataset,
                 mode='oracle',
                 model=DEFAULT_MODEL,
                 query_strategy=DEFAULT_QUERY_STRATEGY,
                 balance_strategy=DEFAULT_BALANCE_STRATEGY,
                 n_instances=DEFAULT_N_INSTANCES,
                 n_queries=1,
                 embedding_fp=None,
                 verbose=1,
                 prior_included=None,
                 prior_excluded=None,
                 n_prior_included=DEFAULT_N_PRIOR_INCLUDED,
                 n_prior_excluded=DEFAULT_N_PRIOR_EXCLUDED,
                 config_file=None,
                 src_log_fp=None,
                 **kwargs):
    """ Get a review object for the given dataset and mode.

        The settings are read from the log at `src_log_fp` when it is given;
        otherwise they are built from the arguments and updated from
        `config_file`. The dataset is either a pickle file holding
        (X, y, embedding_matrix[, extra]) or a file readable by ASReviewData.
        Supported models are "lstm_base", "lstm_pool", "nb" and "svm"/"svc".

        Raises a ValueError for an unknown mode or model, for a pickle file
        with unexpected contents, and for 'oracle' mode combined with a
        pickle dataset (there are no records to show in that case).
    """

    # Find the URL of the datasets if the dataset is an example dataset.
    if dataset in DEMO_DATASETS:
        dataset = DEMO_DATASETS[dataset]

    if src_log_fp is not None:
        logger = Logger(log_fp=src_log_fp)
        settings = logger.settings
    else:
        logger = None
        settings = ASReviewSettings(model=model,
                                    n_instances=n_instances,
                                    n_queries=n_queries,
                                    n_prior_included=n_prior_included,
                                    n_prior_excluded=n_prior_excluded,
                                    query_strategy=query_strategy,
                                    balance_strategy=balance_strategy,
                                    mode=mode,
                                    data_fp=dataset)

        settings.from_file(config_file)
    model_name = settings.model

    if model_name in ["lstm_base", "lstm_pool"]:
        base_model = "RNN"
    else:
        base_model = "other"

    # Check if mode is valid
    if mode in AVAILABLE_REVIEW_CLASSES:
        if verbose:
            print(f"Start review in '{mode}' mode.")
    else:
        raise ValueError(f"Unknown mode '{mode}'.")
    if verbose:
        print(f"Model: '{model_name}'")

    # if the provided file is a pickle file
    if is_pickle(dataset):
        if mode == "oracle":
            # A pickle only carries features and labels, not the records
            # the oracle reviewer is given through as_data.
            raise ValueError(
                "Mode 'oracle' is not supported for pickle datasets.")
        # SECURITY: unpickling executes arbitrary code; only load pickle
        # files that come from a trusted source.
        with open(dataset, 'rb') as f:
            data_obj = pickle.load(f)
        if isinstance(data_obj, tuple) and len(data_obj) == 3:
            X, y, embedding_matrix = data_obj
        elif isinstance(data_obj, tuple) and len(data_obj) == 4:
            X, y, embedding_matrix, _ = data_obj
        else:
            raise ValueError("Incorrect pickle object.")
    else:
        as_data = ASReviewData.from_file(dataset)
        _, texts, labels = as_data.get_data()

        # get the model
        if base_model == "RNN":

            if embedding_fp is None:
                embedding_fp = Path(get_data_home(),
                                    EMBEDDING_EN["name"]).expanduser()

                if not embedding_fp.exists():
                    # Give the user time to abort before the download.
                    print("Warning: will start to download large "
                          "embedding file in 10 seconds.")
                    time.sleep(10)
                    download_embedding(verbose=verbose)

            # create features and labels
            X, word_index = text_to_features(texts)
            y = labels
            embedding = load_embedding(embedding_fp, word_index=word_index)
            embedding_matrix = sample_embedding(embedding, word_index)

        elif model_name.lower() in ['nb', 'svc', 'svm']:
            from sklearn.pipeline import Pipeline
            from sklearn.feature_extraction.text import TfidfTransformer
            from sklearn.feature_extraction.text import CountVectorizer

            text_clf = Pipeline([('vect', CountVectorizer()),
                                 ('tfidf', TfidfTransformer())])

            X = text_clf.fit_transform(texts)
            y = labels

    settings.fit_kwargs = {}
    settings.query_kwargs = {}

    if base_model == 'RNN':
        if model_name == "lstm_base":
            model_kwargs = lstm_base_model_defaults(settings, verbose)
            create_lstm_model = create_lstm_base_model
        elif model_name == "lstm_pool":
            model_kwargs = lstm_pool_model_defaults(settings, verbose)
            create_lstm_model = create_lstm_pool_model
        else:
            raise ValueError(f"Unknown model {model_name}")

        settings.fit_kwargs = lstm_fit_defaults(settings, verbose)
        settings.query_kwargs['verbose'] = verbose
        # create the model
        classifier = KerasClassifier(create_lstm_model(
            embedding_matrix=embedding_matrix, **model_kwargs),
                                     verbose=verbose)

    elif model_name.lower() == 'nb':
        from asreview.models import create_nb_model

        classifier = create_nb_model()

    elif model_name.lower() in ['svm', 'svc']:
        from asreview.models import create_svc_model

        classifier = create_svc_model()
    else:
        raise ValueError('Model not found.')

    # Pick query strategy
    query_fn, query_str = get_query_strategy(settings)
    if verbose:
        print(f"Query strategy: {query_str}")

    train_data_fn, train_method = get_balance_strategy(settings)
    if verbose:
        print(f"Using {train_method} method to obtain training data.")

    # Keyword arguments shared by all review classes.
    common_args = dict(
        model=classifier,
        query_strategy=query_fn,
        train_data_fn=train_data_fn,
        n_instances=settings.n_instances,
        n_queries=settings.n_queries,
        verbose=verbose,
        prior_included=prior_included,
        prior_excluded=prior_excluded,
        fit_kwargs=settings.fit_kwargs,
        balance_kwargs=settings.balance_kwargs,
        query_kwargs=settings.query_kwargs,
        logger=logger,
    )

    # Initialize the review class.
    if mode == "simulate":
        reviewer = ReviewSimulate(X,
                                  y,
                                  n_prior_included=settings.n_prior_included,
                                  n_prior_excluded=settings.n_prior_excluded,
                                  **common_args,
                                  **kwargs)

    elif mode == "oracle":
        reviewer = ReviewOracle(X,
                                as_data=as_data,
                                **common_args,
                                **kwargs)
    elif mode == "minimal":
        reviewer = MinimalReview(X,
                                 **common_args,
                                 **kwargs)
    else:
        raise ValueError("Error finding mode, should never come here...")

    # Store the settings that were actually used in the reviewer's log.
    reviewer._logger.add_settings(settings)

    return reviewer
Пример #8
0
def test_csv_write_data():
    """Read the labelled generic demo CSV and export it with given labels."""
    export_labels = [0, 1, 0, 1, 0, 1]
    source_fp = Path("test", "demo_data", "generic_labels.csv")
    target_fp = Path("test", "out_data", "generic_out.csv")
    ASReviewData.from_file(source_fp).to_csv(target_fp, labels=export_labels)