def test_queue_refill(setup_celery, test_project_data, test_all_queues, test_profile, test_labels, test_redis, tmpdir, settings):
    """Check that the queues refill the way they should.

    Have one person label everything in a batch, then verify that the
    normal queue refills to its previous size while the irr queue now
    holds twice irr% * batch size.
    """
    project = test_project_data
    normal_queue, admin_queue, irr_queue = test_all_queues
    fill_queue(normal_queue, 'random', irr_queue,
               project.percentage_irr, project.batch_size)

    # Split the batch into its irr and non-irr portions.
    expected_irr = math.ceil((project.percentage_irr / 100) * project.batch_size)
    expected_non_irr = math.ceil(
        ((100 - project.percentage_irr) / 100) * project.batch_size)

    # Label the whole batch: all normal data first, then all irr data.
    for queue_type, count in (("normal", expected_non_irr), ("irr", expected_irr)):
        for _ in range(count):
            item = assign_datum(test_profile, project, queue_type)
            assert item is not None
            label_data(test_labels[0], item, test_profile, 3)
            check_and_trigger_model(item, test_profile)

    assert DataQueue.objects.filter(queue=normal_queue).count() == expected_non_irr
    assert DataQueue.objects.filter(queue=irr_queue).count() == expected_irr * 2
def test_check_and_trigger_queue_changes_success(
        setup_celery, test_project_labeled_and_tfidf, test_queue_labeled,
        test_irr_queue_labeled, test_redis, tmpdir, settings, test_profile2):
    """Labeling enough data triggers a model run, saves predictions,
    refills the queues (resized for the new coder count), and starts a
    new training set.
    """
    project = test_project_labeled_and_tfidf
    test_queue = test_queue_labeled
    initial_training_set = project.get_current_training_set()
    pickle_dir = tmpdir.listdir()[0].mkdir('model_pickles')
    settings.MODEL_PICKLE_PATH = str(pickle_dir)

    # Add another user to permissions
    ProjectPermissions.objects.create(profile=test_profile2, project=project,
                                      permission='CODER')

    datum = DataLabel.objects.filter(data__project=project).first().data
    assert check_and_trigger_model(datum) == 'model running'

    # Model was created and its pickle written to the configured path
    assert_obj_exists(Model, {'project': project})
    model = Model.objects.get(project=project)
    assert os.path.isfile(model.pickle_path)
    expected_path = os.path.join(
        str(pickle_dir),
        'project_' + str(project.pk) + '_training_'
        + str(initial_training_set.set_number) + '.pkl')
    assert model.pickle_path == expected_path

    # Predictions exist for every unlabeled datum x label combination
    predictions = DataPrediction.objects.filter(data__project=project)
    unlabeled_count = Data.objects.filter(project=project, labelers=None).count()
    assert len(predictions) == unlabeled_count * project.labels.count()

    # Queues refilled, redis synced, and queue length resized for the
    # new number of coders
    batch_size = project.batch_size
    q = project.queue_set.get(type="normal")
    q_irr = project.queue_set.get(type="irr")
    assert (q.data.count() + q_irr.data.count()) == batch_size
    assert_redis_matches_db(test_redis)
    num_coders = len(project.projectpermissions_set.all()) + 1
    assert q.length == find_queue_length(batch_size, num_coders)

    # Queue is ordered least-confident-first and holds no labeled data
    data_list = get_ordered_data(test_queue.data.all(), 'least confident')
    previous_lc = data_list[0].datauncertainty_set.get().least_confident
    for item in data_list:
        assert len(item.datalabel_set.all()) == 0
        assert_obj_exists(DataUncertainty, {'data': item})
        current_lc = item.datauncertainty_set.get().least_confident
        assert current_lc <= previous_lc
        previous_lc = current_lc
    assert (DataQueue.objects.filter(queue=test_queue).count()
            + DataQueue.objects.filter(queue=test_irr_queue_labeled).count()
            ) == batch_size

    # A new training set was started
    assert project.get_current_training_set() != initial_training_set
    assert (project.get_current_training_set().set_number
            == initial_training_set.set_number + 1)
def label_admin_label(request, data_pk):
    """This is called when an admin manually labels a datum on the admin
    annotation page. It labels a single datum with the given label and
    profile, with null as the time.

    Args:
        request: The POST request
        data_pk: Primary key of the data
    Returns:
        {}
    """
    response = {}
    datum = Data.objects.get(pk=data_pk)
    project = datum.project
    label = Label.objects.get(pk=request.data["labelID"])
    profile = request.user.profile
    current_training_set = project.get_current_training_set()

    with transaction.atomic():
        queue = project.queue_set.get(type="admin")
        DataLabel.objects.create(
            data=datum,
            label=label,
            profile=profile,
            training_set=current_training_set,
            time_to_label=None,
            timestamp=timezone.now(),
        )
        DataQueue.objects.filter(data=datum, queue=queue).delete()

        # keep redis in sync with the queue table
        settings.REDIS.srem(redis_serialize_set(queue), redis_serialize_data(datum))

        # make sure the data is no longer irr
        if datum.irr_ind:
            Data.objects.filter(pk=datum.pk).update(irr_ind=False)

    # NOTE: this checks if the model needs to be triggered, but not if the
    # queues need to be refilled. This is because for something to be in the
    # admin queue, annotate or skip would have already checked for an empty queue
    check_and_trigger_model(datum)
    return Response(response)
def skip_data(request, data_pk):
    """Take a datum that is in the assigneddata queue for that user and
    place it in the admin queue. Remove it from the assignedData queue.

    Args:
        request: The POST request
        data_pk: Primary key of the data
    Returns:
        {}
    """
    data = Data.objects.get(pk=data_pk)
    profile = request.user.profile
    project = data.project
    response = {}

    num_history = IRRLog.objects.filter(data=data).count()

    if RecycleBin.objects.filter(data=data).count() > 0:
        # recycled data is no longer in use: just drop the assignment
        AssignedData.objects.get(data=data, profile=profile).delete()
    elif data.irr_ind or num_history > 0:
        # IRR (or previously-IRR) data: unassign the skipped item, log the
        # skip, and run IRR processing, but don't put it in the admin
        # queue yet
        AssignedData.objects.get(data=data, profile=profile).delete()
        IRRLog.objects.create(data=data, profile=profile, label=None,
                              timestamp=timezone.now())
        # if the IRR history already has more than the needed number of
        # labels it was processed earlier, so skip the processing step
        if num_history <= project.num_users_irr:
            process_irr_label(data, None)
    else:
        # the data is not IRR so treat it as normal
        move_skipped_to_admin_queue(data, profile, project)

    # for all data, check if we need to refill queue
    check_and_trigger_model(data, profile)
    return Response(response)
def annotate_data(request, data_pk):
    """Annotate a single datum which is in the assigneddata queue given
    the user, data_id, and label_id. This will remove it from
    assigneddata, remove it from dataqueue and add it to labeleddata.
    Also check if project is ready to have model run, if so start that
    process.

    Args:
        request: The POST request
        data_pk: Primary key of the data
    Returns:
        {}
    """
    data = Data.objects.get(pk=data_pk)
    project = data.project
    profile = request.user.profile
    response = {}
    label = Label.objects.get(pk=request.data['labelID'])
    labeling_time = request.data['labeling_time']
    num_history = IRRLog.objects.filter(data=data).count()

    if RecycleBin.objects.filter(data=data).count() > 0:
        # this data is no longer in use: just delete the assignment
        AssignedData.objects.get(data=data, profile=profile).delete()
    elif num_history >= project.num_users_irr:
        # the IRR history already has the needed number of labels, so it
        # was processed earlier; record this label in the history only
        # and release the assignment
        IRRLog.objects.create(data=data, profile=profile, label=label,
                              timestamp=timezone.now())
        AssignedData.objects.get(data=data, profile=profile).delete()
    else:
        label_data(label, data, profile, labeling_time)
        if data.irr_ind:
            # reliability data gets the extra IRR processing step
            process_irr_label(data, label)

    # for all data, check if we need to refill queue
    check_and_trigger_model(data, profile)
    return Response(response)
def test_g_naivebayes_classifier(
    setup_celery,
    test_project_gnb_data_tfidf,
    test_gnb_labels,
    test_gnb_queue_list,
    test_profile,
    test_redis,
    tmpdir,
    settings,
):
    """This tests that a project with the Gaussian Naiive Bayes classifier can
    successfully train and give predictions for a model."""
    normal_queue, admin_queue, irr_queue = test_gnb_queue_list
    labels = test_gnb_labels
    project = test_project_gnb_data_tfidf
    batch_size = project.batch_size
    initial_training_set = project.get_current_training_set()

    pickle_dir = tmpdir.listdir()[0].mkdir("model_pickles")
    settings.MODEL_PICKLE_PATH = str(pickle_dir)

    assert project.classifier == "gnb"
    assert project.learning_method == "least confident"

    fill_queue(normal_queue, "random")
    assert DataQueue.objects.filter(queue=normal_queue).count() == batch_size

    # label an entire batch, cycling through the three labels
    for idx in range(batch_size):
        item = assign_datum(test_profile, project)
        label_data(labels[idx % 3], item, test_profile, 3)
    assert check_and_trigger_model(item) == "model running"

    # Assert model created and saved
    assert_obj_exists(Model, {"project": project})
    model = Model.objects.get(project=project)
    assert os.path.isfile(model.pickle_path)
    expected_pickle = os.path.join(
        str(pickle_dir),
        "project_" + str(project.pk) + "_training_"
        + str(initial_training_set.set_number) + ".pkl",
    )
    assert model.pickle_path == expected_pickle

    # Assert predictions created
    predictions = DataPrediction.objects.filter(data__project=project)
    unlabeled_count = Data.objects.filter(project=project, labelers=None).count()
    assert len(predictions) == unlabeled_count * project.labels.count()
def test_check_and_trigger_batched_success(setup_celery,
                                           test_project_labeled_and_tfidf,
                                           test_queue_labeled,
                                           test_irr_queue_labeled, test_redis,
                                           tmpdir, settings):
    """Labeling enough data triggers a model run, saves predictions,
    refills the queues to their original size, and starts a new
    training set.
    """
    project = test_project_labeled_and_tfidf
    test_queue = test_queue_labeled
    initial_training_set = project.get_current_training_set()
    initial_queue_size = test_queue.length
    pickle_dir = tmpdir.listdir()[0].mkdir('model_pickles')
    settings.MODEL_PICKLE_PATH = str(pickle_dir)

    datum = DataLabel.objects.filter(data__project=project).first().data
    assert check_and_trigger_model(datum) == 'model running'

    # Model was created and its pickle written to the configured path
    assert_obj_exists(Model, {'project': project})
    model = Model.objects.get(project=project)
    assert os.path.isfile(model.pickle_path)
    expected_path = os.path.join(
        str(pickle_dir),
        'project_' + str(project.pk) + '_training_'
        + str(initial_training_set.set_number) + '.pkl')
    assert model.pickle_path == expected_path

    # Predictions exist for every unlabeled datum x label combination
    predictions = DataPrediction.objects.filter(data__project=project)
    unlabeled_count = Data.objects.filter(project=project, labelers=None).count()
    assert len(predictions) == unlabeled_count * project.labels.count()

    # Queues refilled to the original size and redis synced
    assert (test_queue.data.count()
            + test_irr_queue_labeled.data.count()) == test_queue.length
    assert_redis_matches_db(test_redis)
    assert test_queue.length == initial_queue_size

    # Queue is ordered least-confident-first and holds no labeled data
    data_list = get_ordered_data(test_queue.data.all(), 'least confident')
    previous_lc = data_list[0].datauncertainty_set.get().least_confident
    for item in data_list:
        assert len(item.datalabel_set.all()) == 0
        assert_obj_exists(DataUncertainty, {'data': item})
        current_lc = item.datauncertainty_set.get().least_confident
        assert current_lc <= previous_lc
        previous_lc = current_lc
    assert (DataQueue.objects.filter(queue=test_queue).count()
            + DataQueue.objects.filter(queue=test_irr_queue_labeled).count()
            ) == TEST_QUEUE_LEN

    # A new training set was started
    assert project.get_current_training_set() != initial_training_set
    assert (project.get_current_training_set().set_number
            == initial_training_set.set_number + 1)
def test_randomforest_classifier(setup_celery,
                                 test_project_randomforest_data_tfidf,
                                 test_randomforest_labels,
                                 test_randomforest_queue_list, test_profile,
                                 test_redis, tmpdir, settings):
    """This tests that a project with the random forest classifier can
    successfully train and give predictions for a model.
    """
    normal_queue, admin_queue, irr_queue = test_randomforest_queue_list
    labels = test_randomforest_labels
    project = test_project_randomforest_data_tfidf
    batch_size = project.batch_size
    initial_training_set = project.get_current_training_set()

    pickle_dir = tmpdir.listdir()[0].mkdir('model_pickles')
    settings.MODEL_PICKLE_PATH = str(pickle_dir)

    assert project.classifier == "random forest"
    assert project.learning_method == 'least confident'

    fill_queue(normal_queue, 'random')
    assert DataQueue.objects.filter(queue=normal_queue).count() == batch_size

    # label an entire batch, cycling through the three labels
    for idx in range(batch_size):
        item = assign_datum(test_profile, project)
        label_data(labels[idx % 3], item, test_profile, 3)
    assert check_and_trigger_model(item) == 'model running'

    # Assert model created and saved
    assert_obj_exists(Model, {'project': project})
    model = Model.objects.get(project=project)
    assert os.path.isfile(model.pickle_path)
    expected_pickle = os.path.join(
        str(pickle_dir),
        'project_' + str(project.pk) + '_training_'
        + str(initial_training_set.set_number) + '.pkl')
    assert model.pickle_path == expected_pickle

    # Assert predictions created
    predictions = DataPrediction.objects.filter(data__project=project)
    unlabeled_count = Data.objects.filter(project=project, labelers=None).count()
    assert len(predictions) == unlabeled_count * project.labels.count()
def test_check_and_trigger_model_first_labeled(
    setup_celery, test_project_data, test_labels, test_queue, test_profile
):
    """A single labeled datum must not trigger a model run or change
    any project state beyond removing the datum from the queue.
    """
    project = test_project_data
    initial_training_set = project.get_current_training_set()
    fill_queue(test_queue, orderby="random")

    datum = assign_datum(test_profile, test_queue.project)
    label_data(test_labels[0], datum, test_profile, 3)

    assert check_and_trigger_model(datum) == "no trigger"

    # nothing changed: same training set, no models, no predictions or
    # uncertainties, and only the one labeled datum left the queue
    assert project.get_current_training_set() == initial_training_set
    assert project.model_set.count() == 0
    assert DataPrediction.objects.filter(data__project=project).count() == 0
    assert DataUncertainty.objects.filter(data__project=project).count() == 0
    assert DataQueue.objects.filter(queue=test_queue).count() == TEST_QUEUE_LEN - 1
def send_check_and_trigger_model_task(project_pk):
    """Run the model-trigger check against one datum of the given project.

    Args:
        project_pk: Primary key of the project whose model state to check.
    """
    # local imports to avoid circular imports at module load time
    from core.models import Data
    from core.utils.utils_model import check_and_trigger_model

    # .first() returns None when the project has no data; guard so we
    # don't pass None into check_and_trigger_model and crash the task.
    datum = Data.objects.filter(project=project_pk).first()
    if datum is not None:
        check_and_trigger_model(datum)