예제 #1
0
def test_annotate_data(seeded_database, client, test_project_data, test_queue,
                       test_labels, test_admin_queue, test_irr_queue):
    """Annotating a datum is rejected without permission, then succeeds.

    The seeded user first posts a label with no project permission and must
    get the "account disabled" detail back with nothing labeled. Once the
    user is made a CODER the same request must label the datum and take it
    out of the data queue.
    """
    project = test_project_data
    fill_queue(test_queue, 'random')
    payload = {"labelID": test_labels[0].pk, "labeling_time": 3}
    denied_text = 'Account disabled by administrator.  Please contact project owner for details'

    client.login(username=SEED_USERNAME, password=SEED_PASSWORD)
    coder = Profile.objects.get(user__username=SEED_USERNAME)

    # Pull a single datum for the user and build its annotate endpoint.
    datum = get_assignments(coder, project, 1)[0]
    annotate_url = '/api/annotate_data/' + str(datum.pk) + '/'

    # No permission yet: expect the error detail and no stored label.
    denied = client.post(annotate_url, payload).json()
    assert 'detail' in denied
    assert denied_text in denied['detail']
    assert DataLabel.objects.filter(data=datum).count() == 0

    ProjectPermissions.objects.create(profile=coder,
                                      project=project,
                                      permission='CODER')

    # With permission: the response carries no error, the datum is labeled
    # once and it is no longer in any queue.
    accepted = client.post(annotate_url, payload).json()
    assert 'error' not in accepted
    assert 'detail' not in accepted
    assert DataLabel.objects.filter(data=datum).count() == 1
    assert DataQueue.objects.filter(data=datum).count() == 0
예제 #2
0
파일: test_api.py 프로젝트: taggsoft/SMART
def test_download_labeled_data(seeded_database, client, admin_client,
                               test_project_labeled, test_queue_labeled,
                               test_irr_queue_labeled,
                               test_admin_queue_labeled):
    """Exercise the download-labeled-data API call.

    A coder must be refused with the admin-only message, while an admin
    must receive a response whose content type is CSV.
    """
    project = test_project_labeled
    fill_queue(test_queue_labeled, 'random', test_irr_queue_labeled,
               project.percentage_irr, project.batch_size)

    # Log each test client in and grant its project role (admin first,
    # then the coder).
    accounts = (
        (admin_client, SEED_USERNAME2, SEED_PASSWORD2, 'ADMIN'),
        (client, SEED_USERNAME, SEED_PASSWORD, 'CODER'),
    )
    for http_client, username, password, level in accounts:
        http_client.login(username=username, password=password)
        profile = Profile.objects.get(user__username=username)
        ProjectPermissions.objects.create(profile=profile,
                                          project=project,
                                          permission=level)

    download_url = '/api/download_data/' + str(project.pk) + '/'

    # The coder lacks admin privileges and must be turned away.
    denied = client.get(download_url).json()
    assert 'detail' in denied
    assert 'Invalid permission. Must be an admin' in denied['detail']

    # The admin gets the file back.
    download = admin_client.get(download_url)
    # NOTE(review): `download` is the raw response object, not parsed JSON,
    # so this membership test is presumably not inspecting the body --
    # confirm what it is meant to check.
    assert 'detail' not in download
    assert download.get("Content-Type") == "text/csv"
예제 #3
0
def test_unassign_after_fillqueue(db, test_profile, test_project_data,
                                  test_queue, test_labels, test_redis):
    """Track the Redis list and set sizes through assign, label and refill.

    The Redis list under ``queue:<pk>`` shrinks when data is assigned, the
    Redis set under ``set:<pk>`` shrinks when data is labeled, and a second
    ``fill_queue`` tops both back up by the number of labeled items.
    """
    list_key = 'queue:' + str(test_queue.pk)
    set_key = 'set:' + str(test_queue.pk)

    fill_queue(test_queue, 'random')
    assert test_redis.llen(list_key) == test_queue.length
    assert test_redis.scard(set_key) == test_queue.length

    # Assigning ten items pops them from the list but leaves the set alone.
    assigned = get_assignments(test_profile, test_project_data, 10)
    assert test_redis.llen(list_key) == (test_queue.length - 10)
    assert test_redis.scard(set_key) == test_queue.length

    # Labeling five of them removes those five from the set only.
    label = test_labels[0]
    for index in range(5):
        label_data(label, assigned[index], test_profile, 3)
    assert test_redis.llen(list_key) == (test_queue.length - 10)
    assert test_redis.scard(set_key) == (test_queue.length - 5)

    # Refilling adds five new items to both structures.
    fill_queue(test_queue, 'random')
    assert test_redis.llen(list_key) == test_queue.length - 5
    assert test_redis.scard(set_key) == test_queue.length
예제 #4
0
def test_queue_refill(setup_celery, test_project_data, test_all_queues,
                      test_profile, test_labels, test_redis, tmpdir, settings):
    """Check that the queues refill as expected after a full batch.

    One profile labels an entire batch (the normal share, then the IRR
    share). Afterwards the normal queue must hold a fresh normal share while
    the IRR queue must hold twice the per-batch IRR amount.
    """
    project = test_project_data
    normal_queue, admin_queue, irr_queue = test_all_queues
    fill_queue(normal_queue, 'random', irr_queue, project.percentage_irr,
               project.batch_size)

    irr_share = project.percentage_irr / 100
    normal_share = (100 - project.percentage_irr) / 100
    irr_count = math.ceil(irr_share * project.batch_size)
    non_irr_count = math.ceil(normal_share * project.batch_size)

    # Label the normal share first and then the IRR share, checking after
    # every label whether a model run should be triggered.
    for queue_type, amount in (("normal", non_irr_count), ("irr", irr_count)):
        for _ in range(amount):
            datum = assign_datum(test_profile, project, queue_type)
            assert datum is not None
            label_data(test_labels[0], datum, test_profile, 3)
            check_and_trigger_model(datum, test_profile)

    normal_total = DataQueue.objects.filter(queue=normal_queue).count()
    irr_total = DataQueue.objects.filter(queue=irr_queue).count()
    assert normal_total == non_irr_count
    assert irr_total == irr_count * 2
예제 #5
0
def upload_data(form_data, project, queue=None, irr_queue=None, batch_size=30):
    """Upload validated form data into a project.

    Steps, in order:
      1. Add the data to the database via ``add_data``.
      2. When a ``queue`` is supplied, fill it (randomly ordered) together
         with the IRR queue.
      3. When any rows were added, save the uploaded data file.
      4. When the project also has a classifier, schedule TF-IDF creation
         followed by the check-and-trigger-model task once the current
         transaction commits.
    """
    new_df = add_data(project, form_data)
    if queue:
        fill_queue(
            queue=queue,
            irr_queue=irr_queue,
            orderby='random',
            irr_percent=project.percentage_irr,
            batch_size=batch_size,
        )

    # Nothing was added, so there is no file to save and no task to start.
    if len(new_df) == 0:
        return

    save_data_file(new_df, project.pk)
    if project.classifier is None:
        return

    def _run_tfidf_then_model_check():
        # Uploaded data may already be labeled and so belong to the current
        # training set, hence the model check. Training needs the TF-IDF
        # matrix, so a chord is used to run the check only after the TF-IDF
        # task has finished.
        chord(
            tasks.send_tfidf_creation_task.s(project.pk),
            tasks.send_check_and_trigger_model_task.si(project.pk),
        ).apply_async()

    transaction.on_commit(_run_tfidf_then_model_check)
예제 #6
0
def send_model_task(project_pk):
    """Train and save a model, predict, then refill the project's queue.

    Args:
        project_pk: Primary key of the ``Project`` to process.

    The task trains and saves a model for the project, runs predictions
    unless the learning method is ``"random"``, creates the next
    ``TrainingSet``, resizes the normal queue if the expected length has
    changed, and finally refills the normal and IRR queues.
    """
    # Imported here rather than at module level, as in the original task.
    from core.models import Project, TrainingSet
    from core.utils.utils_model import predict_data, train_and_save_model
    from core.utils.utils_queue import fill_queue, find_queue_length

    project = Project.objects.get(pk=project_pk)
    queue = project.queue_set.get(type="normal")
    irr_queue = project.queue_set.get(type="irr")
    al_method = project.learning_method
    batch_size = project.batch_size

    model = train_and_save_model(project)
    if al_method != "random":
        predict_data(project, model)
    TrainingSet.objects.create(
        project=project,
        set_number=project.get_current_training_set().set_number + 1)

    # Determine if queue size has changed (num_coders changed) and re-fill
    # queue. Count in the database instead of loading every permission row
    # just to take its length.
    # NOTE(review): the "+ 1" presumably accounts for the project creator,
    # who has no permission row -- confirm.
    num_coders = project.projectpermissions_set.count() + 1
    q_length = find_queue_length(batch_size, num_coders)
    if q_length != queue.length:
        queue.length = q_length
        queue.save()

    fill_queue(
        queue,
        irr_queue=irr_queue,
        orderby=al_method,
        irr_percent=project.percentage_irr,
        batch_size=batch_size,
    )
예제 #7
0
def test_fill_half_irr_queues(setup_celery, test_project_half_irr_data,
                              test_half_irr_all_queues, test_profile,
                              test_redis, tmpdir, settings):
    """Fill the queues of a 50% IRR project and check the split.

    With equal IRR settings the normal and IRR queues must each receive
    their computed share of the batch, hold only data with the matching
    ``irr_ind`` flag, and share no data between them.
    """
    project = test_project_half_irr_data
    normal_queue, admin_queue, irr_queue = test_half_irr_all_queues
    batch_size = project.batch_size
    irr_percent = project.percentage_irr
    fill_queue(normal_queue, 'random', irr_queue, irr_percent, batch_size)

    # Expected sizes of the two queues for this batch.
    expected_irr = math.ceil((irr_percent / 100) * batch_size)
    expected_normal = math.ceil(((100 - irr_percent) / 100) * batch_size)

    normal_total = DataQueue.objects.filter(queue=normal_queue).count()
    irr_total = DataQueue.objects.filter(queue=irr_queue).count()
    assert (normal_total + irr_total) == batch_size
    assert normal_total == expected_normal
    assert irr_total == expected_irr
    assert normal_total == irr_total

    # Every datum in the IRR queue is flagged IRR ...
    unflagged_in_irr = DataQueue.objects.filter(queue=irr_queue,
                                                data__irr_ind=False)
    assert unflagged_in_irr.count() == 0
    # ... and no datum in the normal queue is.
    flagged_in_normal = DataQueue.objects.filter(queue=normal_queue,
                                                 data__irr_ind=True)
    assert flagged_in_normal.count() == 0

    # The two queues must not share any datum (compared by data hash).
    irr_hashes = set(
        DataQueue.objects.filter(queue=irr_queue).values_list(
            'data__hash', flat=True))
    normal_hashes = set(
        DataQueue.objects.filter(queue=normal_queue).values_list(
            'data__hash', flat=True))
    assert irr_hashes.isdisjoint(normal_hashes)
예제 #8
0
def test_get_labeled_data(setup_celery, test_profile, test_project_labeled,
                          test_queue_labeled, test_irr_queue_labeled,
                          test_admin_queue_labeled, test_redis, tmpdir,
                          settings):
    """Check that ``get_labeled_data`` returns the project's labeled data.

    Both return values must be DataFrames, their sizes must match the
    project's labels and ``DataLabel`` rows, and every returned ``ID`` must
    be the upload id of a labeled datum.
    """
    project = test_project_labeled
    project_labels = Label.objects.filter(project=project)
    fill_queue(test_queue_labeled, 'random', test_irr_queue_labeled,
               project.percentage_irr, project.batch_size)

    labeled_data, labels = get_labeled_data(project)
    for frame in (labeled_data, labels):
        assert isinstance(frame, pd.DataFrame)

    # Sizes agree with what the project holds.
    assert len(labels) == len(project_labels)
    project_labeled = DataLabel.objects.filter(data__project=project)
    assert len(labeled_data) == len(project_labeled)

    # Every returned ID is found among the labeled upload ids.
    stored_ids = set(project_labeled.values_list("data__upload_id", flat=True))
    returned_ids = set(labeled_data["ID"].tolist())
    assert len(stored_ids & returned_ids) == len(labeled_data)
예제 #9
0
def test_model_task_redis_no_dupes_data_left_in_queue(
        test_project_labeled_and_tfidf, test_queue_labeled,
        test_irr_queue_labeled, test_admin_queue_labeled, test_redis, tmpdir,
        settings):
    """Run the model task with data still queued and check for duplicates.

    The normal and IRR queues are enlarged to 40, the non-IRR share of a
    batch is labeled, and the model task is run. The training set number
    must advance by one and the Redis list for the normal queue must hold
    no repeated entries.
    """
    project = test_project_labeled_and_tfidf
    initial_training_set = project.get_current_training_set().set_number

    queue = project.queue_set.get(type="normal")
    irr_queue = project.queue_set.get(type="irr")
    for resized in (queue, irr_queue):
        resized.length = 40
        resized.save()

    # Point model pickles at a scratch directory for the run.
    pickle_dir = tmpdir.listdir()[0].mkdir('model_pickles')
    settings.MODEL_PICKLE_PATH = str(pickle_dir)

    batch_size = project.batch_size
    fill_queue(queue,
               'random',
               irr_queue,
               irr_percent=project.percentage_irr,
               batch_size=batch_size)

    labels = project.labels.all()
    to_label = int(batch_size * ((100 - project.percentage_irr) / 100))
    for _ in range(to_label):
        datum = assign_datum(project.creator, project)
        label_data(random.choice(labels), datum, project.creator, 3)

    tasks.send_model_task.delay(project.pk).get()
    current_set = project.get_current_training_set().set_number
    assert current_set == initial_training_set + 1

    redis_items = test_redis.lrange(redis_serialize_queue(queue), 0, -1)
    assert len(redis_items) == len(set(redis_items))
def test_cohens_kappa_perc_agreement_no_agreement(
    setup_celery,
    test_project_half_irr_data,
    test_half_irr_all_queues,
    test_profile,
    test_profile2,
    test_labels_half_irr,
    test_redis,
    tmpdir,
    settings,
):
    """Check kappa and percent agreement when two coders never agree."""
    project = test_project_half_irr_data
    labels = test_labels_half_irr
    normal_queue, admin_queue, irr_queue = test_half_irr_all_queues
    fill_queue(normal_queue, "random", irr_queue, project.percentage_irr,
               project.batch_size)

    # Both profiles label the same five IRR items; the second profile always
    # picks the next label along, so the two never match.
    for round_index in range(5):
        datum = assign_datum(test_profile, project, "irr")
        assign_datum(test_profile2, project, "irr")
        first_choice = labels[round_index % 3]
        second_choice = labels[(round_index + 1) % 3]
        label_data(first_choice, datum, test_profile, 3)
        label_data(second_choice, datum, test_profile2, 3)

    kappa, perc = cohens_kappa(project)
    assert round(kappa, 3) == -0.471
    assert perc == 0.0
예제 #11
0
def test_unassign(db, test_profile, test_project_data, test_queue, test_redis):
    """Unassigning a datum returns it to the queue as the next assignment.

    Assigning pops the datum from the Redis list (the Redis set is
    unchanged); unassigning puts it back, removes the ``AssignedData`` row
    and makes it the next datum handed out.
    """
    list_key = 'queue:' + str(test_queue.pk)
    set_key = 'set:' + str(test_queue.pk)

    fill_queue(test_queue, orderby='random')
    assert test_redis.llen(list_key) == test_queue.length
    assert test_redis.scard(set_key) == test_queue.length

    datum = get_assignments(test_profile, test_project_data, 1)[0]
    assert test_redis.llen(list_key) == (test_queue.length - 1)
    assert test_redis.scard(set_key) == test_queue.length
    assert AssignedData.objects.filter(data=datum,
                                       profile=test_profile).exists()

    unassign_datum(datum, test_profile)
    assert test_redis.llen(list_key) == test_queue.length
    assert test_redis.scard(set_key) == test_queue.length
    assert not AssignedData.objects.filter(data=datum,
                                           profile=test_profile).exists()

    # The datum that was just released must come back first.
    next_datum = get_assignments(test_profile, test_project_data, 1)[0]
    assert next_datum == datum
예제 #12
0
def test_percent_agree_table(seeded_database, client, admin_client, test_project_all_irr_data, test_all_irr_all_queues, test_labels_all_irr):
    """Check that the percent-agreement table endpoint responds correctly.

    Only the shape of the table is verified here: admin-only access, the
    three coder pairs, "No samples" for pairs involving the third user, and
    a percentage between 0 and 100 for the pair that labeled data. Exact
    values are covered by the util tests.
    """
    labels = test_labels_all_irr
    normal_queue, admin_queue, irr_queue = test_all_irr_all_queues
    project = test_project_all_irr_data

    client.login(username=SEED_USERNAME, password=SEED_PASSWORD)
    client_profile = Profile.objects.get(user__username=SEED_USERNAME)
    ProjectPermissions.objects.create(profile=client_profile,
                                      project=project,
                                      permission='CODER')
    admin_client.login(username=SEED_USERNAME2, password=SEED_PASSWORD2)
    admin_profile = Profile.objects.get(user__username=SEED_USERNAME2)
    ProjectPermissions.objects.create(profile=admin_profile,
                                      project=project,
                                      permission='ADMIN')
    third_profile = Profile.objects.get(user__username="******")
    fill_queue(normal_queue, 'random', irr_queue, project.percentage_irr, project.batch_size)

    table_url = '/api/perc_agree_table/' + str(project.pk) + '/'

    # A coder without admin rights is refused.
    refused = client.get(table_url)
    assert 403 == refused.status_code
    assert "Invalid permission. Must be an admin" in str(refused.content)

    # Both users take fifteen assignments; each one's batch is then
    # annotated through the other user's client with the same label choice.
    data = get_assignments(client_profile, project, 15)
    data2 = get_assignments(admin_profile, project, 15)
    for i in range(15):
        payload = {"labelID": labels[i % 3].pk, "labeling_time": 3}

        admin_reply = admin_client.post(
            '/api/annotate_data/' + str(data[i].pk) + '/', payload).json()
        assert 'error' not in admin_reply
        assert 'detail' not in admin_reply

        coder_reply = client.post(
            '/api/annotate_data/' + str(data2[i].pk) + '/', payload).json()
        assert 'error' not in coder_reply
        assert 'detail' not in coder_reply

    table = admin_client.get(table_url).json()
    assert 'data' in table
    response_frame = pd.DataFrame(table['data'])

    # Expected pairs: [client, admin], [client, user3], [admin, user3].
    third_name = str(third_profile)
    first_coders = response_frame['First Coder'].tolist()
    second_coders = response_frame['Second Coder'].tolist()
    assert first_coders == [SEED_USERNAME, SEED_USERNAME, SEED_USERNAME2]
    assert second_coders == [SEED_USERNAME2, third_name, third_name]

    # Exactly those three combinations and nothing more.
    assert len(response_frame) == 3

    # Pairs that include the third user have no samples.
    with_third = response_frame.loc[response_frame['Second Coder'] == third_name]
    assert with_third["Percent Agreement"].tolist() == ["No samples", "No samples"]

    # The first pair reports "n%" with n between 0 and 100.
    perc = response_frame["Percent Agreement"].tolist()[0]
    assert 0 <= float(perc[:-1]) <= 100
def test_redis_parse_list_dataids(test_queue, test_redis):
    """Check that parsing the Redis queue list yields the queue's data ids.

    The queue is filled, its Redis list is read back in full and run
    through ``redis_parse_list_dataids``; the parsed ids must be the same
    collection as the primary keys of the data attached to the queue.
    """
    fill_queue(test_queue, orderby="random")

    data_ids = [d.pk for d in test_queue.data.all()]
    redis_ids = test_redis.lrange(redis_serialize_queue(test_queue), 0, -1)
    parsed_ids = redis_parse_list_dataids(redis_ids)

    # list.sort() sorts in place and returns None, so the previous
    # `data_ids.sort() == parsed_ids.sort()` compared None with None and
    # could never fail. Compare sorted copies instead.
    # NOTE(review): ids are normalised to str on both sides because the
    # parser may return primary keys as strings rather than ints -- confirm
    # against redis_parse_list_dataids and simplify if they are already ints.
    expected = sorted(str(pk) for pk in data_ids)
    actual = sorted(str(pk) for pk in parsed_ids)
    assert expected == actual
def test_redis_parse_data(test_queue, test_redis):
    """Check that a key popped from the Redis queue parses to a real datum."""
    fill_queue(test_queue, orderby="random")

    # Take the first key off the queue's Redis list and parse it.
    queue_key = redis_serialize_queue(test_queue)
    parsed_data = redis_parse_data(test_redis.lpop(queue_key))

    # The parsed datum exists and has a data-queue row.
    parsed_pk = parsed_data.pk
    assert_obj_exists(Data, {"pk": parsed_pk})
    assert_obj_exists(DataQueue, {"data_id": parsed_pk})
def test_fill_queue_all_remaining_data(db, test_queue):
    """A queue longer than the available data ends up holding all of it."""
    available = Data.objects.filter(project=test_queue.project).count()

    # Make the queue one slot longer than the data that exists.
    test_queue.length = available + 1
    test_queue.save()

    fill_queue(test_queue, orderby="random")
    assert test_queue.data.count() == available
def test_init_redis_one_nonempty_queue(db, test_project_data, test_redis):
    """``init_redis`` rebuilds Redis from the database for one filled queue."""
    filled_queue = add_queue(test_project_data, 10)
    fill_queue(filled_queue, orderby="random")

    # Wipe Redis, then rebuild it.
    test_redis.flushdb()
    init_redis()

    assert_redis_matches_db(test_redis)
예제 #17
0
def test_restore_data(
    seeded_database,
    client,
    admin_client,
    test_project_data,
    test_queue,
    test_irr_queue,
    test_labels,
    test_admin_queue,
):
    """Check that discarded data can be restored, and only by an admin.

    A coder and an admin each skip a batch of thirty items, the admin
    discards everything found in the admin queue, the coder is refused when
    trying to restore, and the admin then restores every item.
    """
    project = test_project_data
    fill_queue(test_queue, "random", test_irr_queue, project.percentage_irr,
               project.batch_size)

    admin_client.login(username=SEED_USERNAME2, password=SEED_PASSWORD2)
    admin_profile = Profile.objects.get(user__username=SEED_USERNAME2)
    ProjectPermissions.objects.create(profile=admin_profile,
                                      project=project,
                                      permission="ADMIN")

    client.login(username=SEED_USERNAME, password=SEED_PASSWORD)
    client_profile = Profile.objects.get(user__username=SEED_USERNAME)
    ProjectPermissions.objects.create(profile=client_profile,
                                      project=project,
                                      permission="CODER")

    # The coder, then the admin, each take a batch (expected to mix IRR and
    # non-IRR data) and skip every item in it.
    skippers = ((client, client_profile), (admin_client, admin_profile))
    for http_client, profile in skippers:
        batch = get_assignments(profile, project, 30)
        for index in range(30):
            http_client.post("/api/skip_data/" + str(batch[index].pk) + "/")

    # NOTE(review): this same queryset object is reused below after the
    # discard calls instead of being queried again; presumably intentional,
    # so it is kept as one object here.
    admin_data = DataQueue.objects.filter(data__project=project,
                                          queue=test_admin_queue)

    # Discard everything found in the admin queue.
    for entry in admin_data:
        admin_client.post("/api/discard_data/" + str(entry.data.pk) + "/")

    # Restoring requires admin privileges.
    first_pk = admin_data[0].data.pk
    refused = client.post("/api/restore_data/" + str(first_pk) + "/").json()
    assert "detail" in refused
    assert "Invalid permission. Must be an admin" in refused["detail"]

    # The admin restores each item; it must leave the recycle bin and must
    # not be flagged as IRR.
    for entry in admin_data:
        admin_client.post("/api/restore_data/" + str(entry.data.pk) + "/")
        assert RecycleBin.objects.filter(data=entry.data).count() == 0
        assert not Data.objects.get(pk=entry.data.pk).irr_ind
예제 #18
0
def test_model_task_redis_no_dupes_data_unassign_assigned_data(
        test_project_labeled_and_tfidf, test_queue_labeled,
        test_irr_queue_labeled, test_admin_queue_labeled, test_redis, tmpdir,
        settings):
    """Run the model task twice, then batch-unassign, checking for duplicates.

    After each model run the training set number must have advanced by one,
    and after each run and after the final ``batch_unassign`` the Redis list
    for the normal queue must contain no repeated entries.
    """
    project = test_project_labeled_and_tfidf

    # Two more coders join the project.
    extra_coders = (
        create_profile('test_profilezzz', 'password', '*****@*****.**'),
        create_profile('test_profile2', 'password', '*****@*****.**'),
    )
    for coder in extra_coders:
        ProjectPermissions.objects.create(profile=coder,
                                          project=project,
                                          permission='CODER')

    initial_training_set = project.get_current_training_set().set_number
    queue = project.queue_set.get(type="normal")
    queue.length = 40
    queue.save()

    irr_queue = project.queue_set.get(type="irr")
    irr_queue.length = 40
    irr_queue.save()

    # Point model pickles at a scratch directory for the run.
    pickle_dir = tmpdir.listdir()[0].mkdir('model_pickles')
    settings.MODEL_PICKLE_PATH = str(pickle_dir)

    batch_size = project.batch_size
    fill_queue(queue,
               'random',
               irr_queue,
               irr_percent=project.percentage_irr,
               batch_size=batch_size)

    labels = project.labels.all()

    def assert_redis_list_unique():
        # The normal queue's Redis list must not repeat any entry.
        entries = test_redis.lrange(redis_serialize_queue(queue), 0, -1)
        assert len(entries) == len(set(entries))

    def run_model_and_check(expected_increase):
        # Run the model task to completion, then check the training set
        # number and the Redis list.
        tasks.send_model_task.delay(project.pk).get()
        current_set = project.get_current_training_set().set_number
        assert current_set == initial_training_set + expected_increase
        assert_redis_list_unique()

    # Round one: label a whole batch of assignments.
    first_batch = get_assignments(project.creator, project, batch_size)
    for assignment in first_batch:
        label_data(random.choice(labels), assignment, project.creator, 3)
    run_model_and_check(1)

    # Round two: take forty assignments but label only the first batch's
    # worth, leaving the rest assigned.
    second_batch = get_assignments(project.creator, project, 40)
    for assignment in second_batch[:batch_size]:
        label_data(random.choice(labels), assignment, project.creator, 3)
    run_model_and_check(2)

    # Releasing the leftover assignments must not create duplicates either.
    batch_unassign(project.creator)
    assert_redis_list_unique()
예제 #19
0
def test_skip_irr(
    setup_celery,
    test_project_half_irr_data,
    test_half_irr_all_queues,
    test_profile,
    test_profile2,
    test_profile3,
    test_labels_half_irr,
    test_redis,
    tmpdir,
    settings,
):
    """Check where an IRR datum ends up when it is skipped.

    Three situations are covered: a single skip, a second skip by another
    user, and a skip by a third user after two others have already labeled
    the datum.
    """
    project = test_project_half_irr_data
    normal_queue, admin_queue, irr_queue = test_half_irr_all_queues
    fill_queue(normal_queue, "random", irr_queue, project.percentage_irr,
               project.batch_size)

    def rows_in_queue(item, queue):
        # Number of data-queue rows linking `item` to `queue`.
        return DataQueue.objects.filter(data=item, queue=queue).count()

    # There should be at least one IRR datum to hand out.
    datum = assign_datum(test_profile, project, "irr")
    assert datum is not None

    # First skip: the datum stays in the IRR queue, is logged in IRRLog for
    # that user, and is in neither the admin queue nor DataLabel.
    skip_data(datum, test_profile)
    assert rows_in_queue(datum, admin_queue) == 0
    assert rows_in_queue(datum, irr_queue) == 1
    assert IRRLog.objects.filter(data=datum, profile=test_profile).count() == 1
    assert DataLabel.objects.filter(data=datum,
                                    profile=test_profile).count() == 0

    # Second skip by another user: the datum moves to the admin queue, has
    # two IRRLog entries, and is nowhere else.
    datum2 = assign_datum(test_profile2, project, "irr")
    assert datum.pk == datum2.pk
    skip_data(datum2, test_profile2)
    assert rows_in_queue(datum, admin_queue) == 1
    assert rows_in_queue(datum, irr_queue) == 0
    assert IRRLog.objects.filter(data=datum).count() == 2
    assert DataLabel.objects.filter(data=datum).count() == 0

    # Two users label a new IRR datum and a third skips it: it is logged
    # three times and labeled once, but is in no queue.
    second_datum = assign_datum(test_profile, project, "irr")
    second_datum2 = assign_datum(test_profile2, project, "irr")
    assert second_datum.pk != datum.pk
    assert second_datum.pk == second_datum2.pk
    second_datum3 = assign_datum(test_profile3, project, "irr")
    assert second_datum2.pk == second_datum3.pk

    agreed_label = test_labels_half_irr[0]
    label_data(agreed_label, second_datum, test_profile, 3)
    label_data(agreed_label, second_datum2, test_profile2, 3)
    skip_data(second_datum3, test_profile3)
    assert rows_in_queue(second_datum3, admin_queue) == 0
    assert rows_in_queue(second_datum3, irr_queue) == 0
    assert IRRLog.objects.filter(data=second_datum3).count() == 3
    assert DataLabel.objects.filter(data=second_datum3).count() == 1
예제 #20
0
def test_annotate_irr(setup_celery, test_project_half_irr_data,
                      test_half_irr_all_queues, test_profile, test_profile2,
                      test_profile3, test_labels_half_irr, test_redis, tmpdir,
                      settings):
    """Walk through the IRR labeling workflow and check each model.

    Covers a first label, an agreeing second label, a third label arriving
    after the datum is resolved, and a disagreement between two users on a
    new datum.
    """
    project = test_project_half_irr_data
    normal_queue, admin_queue, irr_queue = test_half_irr_all_queues
    fill_queue(normal_queue, 'random', irr_queue, project.percentage_irr,
               project.batch_size)

    first_label = test_labels_half_irr[0]
    other_label = test_labels_half_irr[1]

    def rows_in_queue(item, queue):
        # Number of data-queue rows linking `item` to `queue`.
        return DataQueue.objects.filter(data=item, queue=queue).count()

    # There should be at least one IRR datum to hand out.
    datum = assign_datum(test_profile, project, "irr")
    assert datum is not None

    # First label: stored in DataLabel for that user, not yet in IRRLog,
    # and the datum is still in the IRR queue.
    label_data(first_label, datum, test_profile, 3)
    assert DataLabel.objects.filter(data=datum,
                                    profile=test_profile).count() > 0
    assert IRRLog.objects.filter(data=datum, profile=test_profile).count() == 0
    assert rows_in_queue(datum, irr_queue) > 0

    # The other two users receive the same datum.
    datum2 = assign_datum(test_profile2, project, "irr")
    assert datum.pk == datum2.pk
    datum3 = assign_datum(test_profile3, project, "irr")
    assert datum.pk == datum3.pk

    # Second, agreeing label: one DataLabel row owned by the project
    # creator, two IRRLog rows, and the datum has left the IRR queue.
    label_data(first_label, datum2, test_profile2, 3)
    assert DataLabel.objects.filter(data=datum2).count() == 1
    assert DataLabel.objects.get(data=datum2).profile.pk == project.creator.pk
    assert IRRLog.objects.filter(data=datum2).count() == 2
    assert rows_in_queue(datum2, irr_queue) == 0

    # Third label on the resolved datum: logged in IRRLog, but the existing
    # DataLabel row is left as it was.
    label_data(first_label, datum3, test_profile3, 3)
    assert IRRLog.objects.filter(data=datum3).count() == 3
    assert DataLabel.objects.filter(data=datum3).count() == 1
    assert DataLabel.objects.get(data=datum3).profile.pk == project.creator.pk

    # Disagreement on a new datum: it goes to the admin queue, leaves the
    # IRR queue, has no DataLabel row and two IRRLog rows.
    second_datum = assign_datum(test_profile, project, "irr")
    assert datum.pk != second_datum.pk
    second_datum2 = assign_datum(test_profile2, project, "irr")
    label_data(first_label, second_datum, test_profile, 3)
    label_data(other_label, second_datum2, test_profile2, 3)
    assert rows_in_queue(second_datum2, admin_queue) == 1
    assert rows_in_queue(second_datum2, irr_queue) == 0
    assert DataLabel.objects.filter(data=second_datum2).count() == 0
    assert IRRLog.objects.filter(data=second_datum2).count() == 2
예제 #21
0
def test_get_irr_metrics(seeded_database, client, admin_client, test_project_half_irr_data, test_half_irr_all_queues, test_labels_half_irr):
    """Exercise the IRR metrics API call.

    Checks admin-only access, the placeholder text returned before any IRR
    data is processed, and the value ranges once both users have labeled
    three items. Exact values are covered by the util tests.
    """
    labels = test_labels_half_irr
    normal_queue, admin_queue, irr_queue = test_half_irr_all_queues
    project = test_project_half_irr_data

    # Sign both users in and grant their project roles.
    client.login(username=SEED_USERNAME, password=SEED_PASSWORD)
    client_profile = Profile.objects.get(user__username=SEED_USERNAME)
    ProjectPermissions.objects.create(profile=client_profile,
                                      project=project,
                                      permission='CODER')
    admin_client.login(username=SEED_USERNAME2, password=SEED_PASSWORD2)
    admin_profile = Profile.objects.get(user__username=SEED_USERNAME2)
    ProjectPermissions.objects.create(profile=admin_profile,
                                      project=project,
                                      permission='ADMIN')

    fill_queue(normal_queue, 'random', irr_queue, project.percentage_irr, project.batch_size)

    metrics_url = '/api/get_irr_metrics/' + str(project.pk) + '/'

    # A coder without admin rights is refused.
    refused = client.get(metrics_url)
    assert 403 == refused.status_code
    assert "Invalid permission. Must be an admin" in str(refused.content)

    # Before any labeling there is nothing to report.
    initial = admin_client.get(metrics_url).json()
    assert 'error' not in initial
    assert 'detail' not in initial
    assert 'kappa' in initial
    assert initial['kappa'] == "No irr data processed"
    assert 'percent agreement' in initial
    assert initial['percent agreement'] == "No irr data processed"

    # Each user labels three items; the admin always picks the next label
    # along from the coder's choice.
    data = get_assignments(client_profile, project, 3)
    data2 = get_assignments(admin_profile, project, 3)
    for i in range(3):
        coder_reply = client.post(
            '/api/annotate_data/' + str(data[i].pk) + '/',
            {"labelID": labels[i].pk, "labeling_time": 3}).json()
        assert 'error' not in coder_reply
        assert 'detail' not in coder_reply

        admin_reply = admin_client.post(
            '/api/annotate_data/' + str(data2[i].pk) + '/',
            {"labelID": labels[(i + 1) % 3].pk, "labeling_time": 3}).json()
        assert 'error' not in admin_reply

    metrics = admin_client.get(metrics_url).json()

    # Percent agreement is a number from 0 to 100 followed by "%".
    assert 'percent agreement' in metrics
    agreement_text = metrics['percent agreement']
    percent = float(agreement_text[:-1])
    assert 0 <= percent <= 100
    assert '%' == agreement_text[-1]

    # Kappa lies between -1 and 1.
    assert 'kappa' in metrics
    assert -1 <= metrics['kappa'] <= 1
예제 #22
0
def test_assign_datum_profile_queue_returns_correct_datum(
        db, test_profile_queue, test_profile, test_profile_queue2,
        test_profile2, test_redis):
    """Fill both queue fixtures, then check that ``assign_datum`` hands
    ``test_profile`` a ``Data`` instance."""
    # Populate each queue in turn so there is data available to assign.
    for populated_queue in (test_profile_queue, test_profile_queue2):
        fill_queue(populated_queue, orderby='random')

    target_project = test_profile_queue.project
    assigned = assign_datum(test_profile, target_project)

    assert isinstance(assigned, Data)
def test_assign_datum_project_queue_returns_datum(db, test_queue, test_profile,
                                                  test_redis):
    """Check that assigning from a filled project-wide queue (null profile
    ID) yields a ``Data`` instance."""
    fill_queue(test_queue, orderby="random")
    owning_project = test_queue.project

    assigned = assign_datum(test_profile, owning_project)

    # The assignment must have produced an actual datum.
    assert isinstance(assigned, Data)
예제 #24
0
def test_fill_queue_random_predicted_data(test_project_predicted_data,
                                          test_queue, test_redis):
    """Fill the queue randomly and check its contents.

    After filling, Redis must match the database, the queue must hold
    ``test_queue.length`` items, and every queued datum must have no labels
    and an associated ``DataUncertainty`` row.
    """
    fill_queue(test_queue, 'random')

    assert_redis_matches_db(test_redis)

    queued_total = test_queue.data.count()
    assert queued_total == test_queue.length

    for queued in test_queue.data.all():
        attached_labels = queued.datalabel_set.all()
        assert len(attached_labels) == 0
        assert_obj_exists(DataUncertainty, {'data': queued})
def test_g_naivebayes_classifier(
    setup_celery,
    test_project_gnb_data_tfidf,
    test_gnb_labels,
    test_gnb_queue_list,
    test_profile,
    test_redis,
    tmpdir,
    settings,
):
    """Check that a project using the Gaussian Naive Bayes classifier can
    train a model and produce predictions from it."""
    normal_queue, _admin_queue, _irr_queue = test_gnb_queue_list
    project = test_project_gnb_data_tfidf

    learning_method = project.learning_method
    batch_size = project.batch_size
    first_training_set = project.get_current_training_set()

    # Point the model pickle location at a fresh directory under tmpdir.
    pickle_dir = tmpdir.listdir()[0].mkdir("model_pickles")
    settings.MODEL_PICKLE_PATH = str(pickle_dir)

    assert project.classifier == "gnb"
    assert learning_method == "least confident"

    fill_queue(normal_queue, "random")

    queued_rows = DataQueue.objects.filter(queue=normal_queue)
    assert queued_rows.count() == batch_size

    # Label one full batch, cycling through the first three labels.
    for position in range(batch_size):
        datum = assign_datum(test_profile, project)
        chosen_label = test_gnb_labels[position % 3]
        label_data(chosen_label, datum, test_profile, 3)

    # Trigger the model check using the last datum that was labeled.
    trigger_status = check_and_trigger_model(datum)
    assert trigger_status == "model running"

    # Assert model created and saved
    assert_obj_exists(Model, {"project": project})
    trained_model = Model.objects.get(project=project)
    assert os.path.isfile(trained_model.pickle_path)

    pickle_name = "".join([
        "project_",
        str(project.pk),
        "_training_",
        str(first_training_set.set_number),
        ".pkl",
    ])
    expected_pickle_path = os.path.join(str(pickle_dir), pickle_name)
    assert trained_model.pickle_path == expected_pickle_path

    # Assert predictions created: one per (unlabeled datum, label) pair.
    predictions = DataPrediction.objects.filter(data__project=project)
    prediction_total = len(predictions)
    unlabeled_total = Data.objects.filter(project=project,
                                          labelers=None).count()
    label_total = project.labels.count()
    assert prediction_total == unlabeled_total * label_total
def test_init_redis_multiple_queues(db, test_project_data, test_redis):
    """Check that ``init_redis`` rebuilds Redis to match the database when
    the project has two queues, one filled and one left unfilled."""
    queue_length = 10

    # First queue gets filled; the second is only created.
    filled_queue = add_queue(test_project_data, queue_length)
    fill_queue(filled_queue, orderby="random")
    add_queue(test_project_data, queue_length)

    # Wipe Redis, then rebuild it.
    test_redis.flushdb()
    init_redis()

    assert_redis_matches_db(test_redis)
예제 #27
0
def test_redis_parse_queue(test_queue, test_redis):
    """Check that ``redis_parse_queue`` maps a Redis queue key back to the
    queue that was filled."""
    fill_queue(test_queue, orderby='random')

    # Collect every Redis key whose decoded name mentions 'queue'.
    matching_keys = [
        raw_key for raw_key in test_redis.keys()
        if 'queue' in raw_key.decode()
    ]
    first_key = matching_keys[0]
    parsed_queue = redis_parse_queue(first_key)

    assert parsed_queue.pk == test_queue.pk

    parsed_pk = parsed_queue.pk
    assert_obj_exists(DataQueue, {'queue_id': parsed_pk})
    assert_obj_exists(Queue, {'pk': parsed_queue.pk})
def test_redis_parse_queue(test_queue, test_redis):
    """Fill the queue, parse the first Redis key containing "queue", and
    check that the parsed queue is the one that was filled."""
    fill_queue(test_queue, orderby="random")

    # Gather all matching keys first, then take the first one.
    candidate_keys = []
    for redis_key in test_redis.keys():
        if "queue" in redis_key.decode():
            candidate_keys.append(redis_key)
    parsed_queue = redis_parse_queue(candidate_keys[0])

    assert parsed_queue.pk == test_queue.pk

    # Both the queue row and a DataQueue row for it must exist.
    assert_obj_exists(DataQueue, {"queue_id": parsed_queue.pk})
    assert_obj_exists(Queue, {"pk": parsed_queue.pk})
def test_fill_nonempty_queue(db, test_queue):
    """Check that ``fill_queue`` brings a queue already holding one datum up
    to ``test_queue.length`` items."""
    # Seed the queue by hand with a single observation.
    seed_fields = {
        "text": "test data",
        "project": test_queue.project,
        "upload_id_hash": md5_hash(0),
    }
    seed_datum = Data.objects.create(**seed_fields)
    DataQueue.objects.create(data=seed_datum, queue=test_queue)

    count_before_fill = test_queue.data.count()
    assert count_before_fill == 1

    fill_queue(test_queue, orderby="random")

    count_after_fill = test_queue.data.count()
    assert count_after_fill == test_queue.length
예제 #30
0
def test_pop_first_nonempty_queue_single_queue(db, test_project_data,
                                               test_queue, test_redis):
    """Pop from a project after filling ``test_queue``.

    ``pop_first_nonempty_queue`` is expected to return a ``(queue, data)``
    pair in which the queue is ``test_queue`` and the data is a ``Data``
    instance.
    """
    # Fill the queue so there is something to pop.
    fill_queue(test_queue, orderby='random')

    queue, data = pop_first_nonempty_queue(test_project_data)

    # The returned queue must be the one that was filled.
    assert isinstance(queue, Queue)
    assert queue == test_queue

    # The popped item must be a datum.
    assert isinstance(data, Data)