Пример #1
0
def export_to_string(project_id, export_type="csv"):
    fp_lock = get_lock_path(project_id)
    as_data = read_data(project_id)
    with SQLiteLock(fp_lock, blocking=True, lock_name="active"):
        proba = read_proba(project_id)
        if proba is None:
            proba = np.flip(np.arange(len(as_data)))
        else:
            proba = np.array(proba)
        labels = read_current_labels(project_id, as_data=as_data)

    pool_idx = np.where(labels == LABEL_NA)[0]
    one_idx = np.where(labels == 1)[0]
    zero_idx = np.where(labels == 0)[0]

    proba_order = np.argsort(-proba[pool_idx])
    ranking = np.concatenate((one_idx, pool_idx[proba_order], zero_idx),
                             axis=None)

    if export_type == "csv":
        return as_data.to_csv(fp=None, labels=labels, ranking=ranking)
    if export_type == "excel":
        get_tmp_path(project_id).mkdir(exist_ok=True)
        fp_tmp_export = Path(get_tmp_path(project_id), "export_result.xlsx")
        return as_data.to_excel(fp=fp_tmp_export,
                                labels=labels,
                                ranking=ranking)
    else:
        raise ValueError("This export type isn't implemented.")
Пример #2
0
def remove_dataset_to_project(project_id, file_name):
    """Remove dataset from project

    """

    project_file_path = get_project_file_path(project_id)
    fp_lock = get_lock_path(project_id)

    with SQLiteLock(fp_lock,
                    blocking=True,
                    lock_name="active",
                    project_id=project_id):

        # open the projects file
        with open(project_file_path, "r") as f_read:
            project_dict = json.load(f_read)

        # remove the path from the project file
        data_fn = project_dict["dataset_path"]
        del project_dict["dataset_path"]

        with open(project_file_path, "w") as f_write:
            json.dump(project_dict, f_write)

        # files to remove
        data_path = get_data_file_path(project_id, data_fn)
        pool_path = get_pool_path(project_id)
        labeled_path = get_labeled_path(project_id)

        os.remove(str(data_path))
        os.remove(str(pool_path))
        os.remove(str(labeled_path))
Пример #3
0
def api_get_prior(project_id):  # noqa: F401
    """Get all papers classified as prior documents
    """

    subset = request.args.get('subset', default=None, type=str)

    # check if the subset exists
    if subset is not None and subset not in ["included", "excluded"]:
        message = "Unkown subset parameter"
        return jsonify(message=message), 400

    lock_fp = get_lock_path(project_id)
    with SQLiteLock(lock_fp, blocking=True, lock_name="active"):
        label_history = read_label_history(project_id, subset=subset)

    indices = [x[0] for x in label_history]

    records = read_data(project_id).record(indices)

    payload = {"result": []}
    for i, record in enumerate(records):

        payload["result"].append({
            "id": int(record.record_id),
            "title": record.title,
            "abstract": record.abstract,
            "authors": record.authors,
            "keywords": record.keywords,
            "included": int(label_history[i][1])
        })

    response = jsonify(payload)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
Пример #4
0
def api_get_prior(project_id):  # noqa: F401
    """Get all papers classified as prior documents
    """
    lock_fp = get_lock_path(project_id)
    with SQLiteLock(lock_fp, blocking=True, lock_name="active"):
        label_history = read_label_history(project_id)

    indices = [x[0] for x in label_history]

    records = read_data(project_id).record(indices)

    payload = {"result": []}
    for i, record in enumerate(records):

        payload["result"].append({
            "id": int(record.record_id),
            "title": record.title,
            "abstract": record.abstract,
            "authors": record.authors,
            "keywords": record.keywords,
            "included": int(label_history[i][1])
        })

    response = jsonify(payload)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
Пример #5
0
def get_statistics(project_id):
    fp_lock = get_lock_path(project_id)

    with SQLiteLock(fp_lock,
                    blocking=True,
                    lock_name="active",
                    project_id=project_id):
        # get the index of the active iteration
        label_history = read_label_history(project_id)
        current_labels = read_current_labels(project_id,
                                             label_history=label_history)

    n_since_last_inclusion = 0
    for _, inclusion in reversed(label_history):
        if inclusion == 1:
            break
        n_since_last_inclusion += 1

    n_included = len(np.where(current_labels == 1)[0])
    n_excluded = len(np.where(current_labels == 0)[0])
    n_papers = len(current_labels)
    stats = {
        "n_included": n_included,
        "n_excluded": n_excluded,
        "n_since_last_inclusion": n_since_last_inclusion,
        "n_papers": n_papers,
        "n_pool": n_papers - n_included - n_excluded
    }
    return stats
Пример #6
0
def label_instance(project_id, paper_i, label, retrain_model=True):
    """Label a paper after reviewing the abstract.

    """

    paper_i = int(paper_i)
    label = int(label)

    fp_lock = get_lock_path(project_id)

    with SQLiteLock(
            fp_lock, blocking=True, lock_name="active", project_id=project_id):

        # get the index of the active iteration
        if int(label) in [0, 1]:
            move_label_from_pool_to_labeled(project_id, paper_i, label)
        else:
            move_label_from_labeled_to_pool(project_id, paper_i)

    if retrain_model:
        # Update the model (if it isn't busy).

        py_exe = _get_executable()
        run_command = [py_exe, "-m", "asreview", "web_run_model", project_id]
        subprocess.Popen(run_command)
Пример #7
0
def export_to_string(project_id, export_type="csv"):

    # read the dataset into a ASReview data object
    as_data = read_data(project_id)

    # set the lock to safely read labeled, pool, and proba
    fp_lock = get_lock_path(project_id)
    with SQLiteLock(
            fp_lock,
            blocking=True,
            lock_name="active",
            project_id=project_id
    ):
        proba = read_proba(project_id)
        pool = read_pool(project_id)
        labeled = read_label_history(project_id)

    # get the record_id of the inclusions and exclusions
    inclusion_record_id = [int(x[0]) for x in labeled if x[1] == 1]
    exclusion_record_id = [int(x[0]) for x in labeled if x[1] == 0]

    # order the pool from high to low proba
    if proba is not None:
        pool_ordered = proba.loc[pool, :] \
            .sort_values("proba", ascending=False).index.values
    else:
        pool_ordered = pool_ordered

    # get the ranking of the 3 subcategories
    ranking = np.concatenate(
        (
            # add the inclusions first
            inclusion_record_id,
            # add the ordered pool second
            pool_ordered,
            # add the exclusions last
            exclusion_record_id
        ),
        axis=None
    )

    # export the data to file
    if export_type == "csv":
        return as_data.to_csv(fp=None, labels=labeled, ranking=ranking)

    if export_type == "tsv":
        return as_data.to_csv(
            fp=None, sep="\t", labels=labeled, ranking=ranking)

    if export_type == "excel":
        get_tmp_path(project_id).mkdir(exist_ok=True)
        fp_tmp_export = Path(get_tmp_path(project_id), "export_result.xlsx")
        return as_data.to_excel(
            fp=fp_tmp_export, labels=labeled, ranking=ranking)
    else:
        raise ValueError("This export type isn't implemented.")
Пример #8
0
def api_random_prior_papers(project_id):  # noqa: F401
    """Get a selection of random papers to find exclusions.

    This set of papers is extracted from the pool, but without
    the already labeled items.
    """

    lock_fp = get_lock_path(project_id)
    with SQLiteLock(lock_fp,
                    blocking=True,
                    lock_name="active",
                    project_id=project_id):
        pool = read_pool(project_id)

    #     with open(get_labeled_path(project_id, 0), "r") as f_label:
    #         prior_labeled = json.load(f_label)

    # excluded the already labeled items from our random selection.
    #     prior_labeled_index = [int(label) for label in prior_labeled.keys()]
    #     pool = [i for i in pool if i not in prior_labeled_index]

    # sample from the pool (this is already done atm of initializing
    # the pool. But doing it again because a double shuffle is always
    # best)

    try:
        pool_random = np.random.choice(pool, 1, replace=False)[0]
    except Exception:
        raise ValueError("Not enough random indices to sample from.")

    try:
        record = read_data(project_id).record(pool_random, by_index=False)

        payload = {"result": []}

        payload["result"].append({
            "id": int(record.record_id),
            "title": record.title,
            "abstract": record.abstract,
            "authors": record.authors,
            "keywords": record.keywords,
            "included": None
        })

    except Exception as err:
        logging.error(err)
        return jsonify(message="Failed to load random documents."), 500

    response = jsonify(payload)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
Пример #9
0
def add_dataset_to_project(project_id, file_name):
    """Add file path to the project file.

    Add file to data subfolder and fill the pool of iteration 0.
    """

    project_file_path = get_project_file_path(project_id)

    # clean temp project files
    clean_project_tmp_files(project_id)

    with SQLiteLock(
            get_lock_path(project_id),
            blocking=True,
            lock_name="active",
            project_id=project_id
    ):
        # open the projects file
        with open(project_file_path, "r") as f_read:
            project_dict = json.load(f_read)

        # add path to dict (overwrite if already exists)
        project_dict["dataset_path"] = file_name

        with open(project_file_path, "w") as f_write:
            json.dump(project_dict, f_write)

        # fill the pool of the first iteration
        as_data = read_data(project_id)

        if as_data.labels is not None:
            unlabeled = np.where(as_data.labels == LABEL_NA)[0]
            pool_indices = as_data.record_ids[unlabeled]

            labeled_indices = np.where(as_data.labels != LABEL_NA)[0]
            label_indices = list(zip(
                as_data.record_ids[labeled_indices].tolist(),
                as_data.labels[labeled_indices].tolist()
            ))
        else:
            pool_indices = as_data.record_ids
            label_indices = []

        np.random.shuffle(pool_indices)
        write_pool(project_id, pool_indices.tolist())

        # make a empty qeue for the items to label
        write_label_history(project_id, label_indices)
Пример #10
0
def get_instance(project_id):
    """Get a new instance to review.

    Arguments
    ---------
    project_id: str
        The id of the current project.
    """

    fp_lock = get_lock_path(project_id)

    with SQLiteLock(fp_lock, blocking=True, lock_name="active"):
        pool_idx = read_pool(project_id)

    logging.info(f"Requesting {pool_idx[0]} from project {project_id}")
    return pool_idx[0]
Пример #11
0
def api_get_prior_stats(project_id):  # noqa: F401
    """Get all papers classified as prior documents
    """
    lock_fp = get_lock_path(project_id)
    with SQLiteLock(lock_fp, blocking=True, lock_name="active"):
        label_history = read_label_history(project_id)

    counter_prior = Counter([x[1] for x in label_history])

    response = jsonify({
        "n_prior": len(label_history),
        "n_inclusions": counter_prior[1],
        "n_exclusions": counter_prior[0]
    })
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
Пример #12
0
def export_to_string(project_id):
    fp_lock = get_lock_path(project_id)
    as_data = read_data(project_id)
    with SQLiteLock(fp_lock, blocking=True, lock_name="active"):
        proba = read_proba(project_id)
        if proba is None:
            proba = np.flip(np.arange(len(as_data)))
        else:
            proba = np.array(proba)
        labels = read_current_labels(project_id, as_data=as_data)

    pool_idx = np.where(labels == LABEL_NA)[0]
    one_idx = np.where(labels == 1)[0]
    zero_idx = np.where(labels == 0)[0]

    proba_order = np.argsort(-proba[pool_idx])
    ranking = np.concatenate((one_idx, pool_idx[proba_order], zero_idx),
                             axis=None)
    return as_data.to_csv(fp=None, labels=labels, ranking=ranking)
Пример #13
0
def get_instance(project_id):
    """Get a new instance to review.

    Arguments
    ---------
    project_id: str
        The id of the current project.
    """

    fp_lock = get_lock_path(project_id)

    with SQLiteLock(
            fp_lock, blocking=True, lock_name="active", project_id=project_id):
        pool_idx = read_pool(project_id)

    if len(pool_idx) > 0:
        return pool_idx[0]
    else:
        # end of pool
        return None
Пример #14
0
def add_dataset_to_project(project_id, file_name):
    """Add file path to the project file.

    Add file to data subfolder and fill the pool of iteration 0.
    """

    project_file_path = get_project_file_path(project_id)
    fp_lock = get_lock_path(project_id)

    with SQLiteLock(fp_lock, blocking=True, lock_name="active"):
        # open the projects file
        with open(project_file_path, "r") as f_read:
            project_dict = json.load(f_read)

        # add path to dict (overwrite if already exists)
        project_dict["dataset_path"] = file_name

        with open(project_file_path, "w") as f_write:
            json.dump(project_dict, f_write)

        # fill the pool of the first iteration
        as_data = read_data(project_id)

        if as_data.labels is not None:
            unlabeled = np.where(as_data.labels == LABEL_NA)[0]
            pool_indices = as_data.record_ids[unlabeled]

            label_indices_included = \
                [[int(x), 1] for x in np.where(as_data.labels == 1)[0]]
            label_indices_excluded = \
                [[int(x), 0] for x in np.where(as_data.labels == 0)[0]]
            label_indices = label_indices_included + label_indices_excluded
        else:
            pool_indices = as_data.record_ids
            label_indices = []

        np.random.shuffle(pool_indices)
        write_pool(project_id, pool_indices.tolist())

        # make a empty qeue for the items to label
        write_label_history(project_id, label_indices)
Пример #15
0
def label_instance(project_id, paper_i, label, retrain_model=True):
    """Label a paper after reviewing the abstract.

    """

    paper_i = int(paper_i)
    label = int(label)

    fp_lock = get_lock_path(project_id)

    with SQLiteLock(fp_lock, blocking=True, lock_name="active"):

        # get the index of the active iteration
        if int(label) in [0, 1]:
            move_label_from_pool_to_labeled(project_id, paper_i, label)
        else:
            move_label_from_labeled_to_pool(project_id, paper_i, label)

    if retrain_model:
        # Update the model (if it isn't busy).
        run_command = f"python -m asreview web_run_model '{project_id}'"
        subprocess.Popen(shlex.split(run_command))
Пример #16
0
def get_statistics(project_id):
    """Get statistics from project files.

    Arguments
    ---------
    project_id: str
        The id of the current project.

    Returns
    -------
    dict:
        Dictonary with statistics.
    """
    fp_lock = get_lock_path(project_id)

    with SQLiteLock(
            fp_lock,
            blocking=True,
            lock_name="active",
            project_id=project_id
    ):
        # get the index of the active iteration
        labeled = read_label_history(project_id)
        pool = read_pool(project_id)

    # compute metrics
    n_included = n_relevant(labeled)
    n_excluded = n_irrelevant(labeled)
    n_pool = len(pool)

    return {
        "n_included": n_included,
        "n_excluded": n_excluded,
        "n_since_last_inclusion": stop_n_since_last_relevant(labeled),
        "n_papers": n_pool + n_included + n_excluded,
        "n_pool": n_pool
    }
Пример #17
0
def api_get_prior_stats(project_id):  # noqa: F401
    """Get all papers classified as prior documents
    """
    try:
        lock_fp = get_lock_path(project_id)
        with SQLiteLock(lock_fp,
                        blocking=True,
                        lock_name="active",
                        project_id=project_id):
            label_history = read_label_history(project_id)

        counter_prior = Counter([x[1] for x in label_history])

    except Exception as err:
        logging.error(err)
        return jsonify(message="Failed to load prior information."), 500

    response = jsonify({
        "n_prior": len(label_history),
        "n_inclusions": counter_prior[1],
        "n_exclusions": counter_prior[0]
    })
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response