Example #1
    def get_worker(self) -> Worker:
        """
        Return the worker that is using this agent for a task
        """
        if self._worker is None:
            self._worker = Worker.get(self.db, self.worker_id)
        return self._worker
Example #2
    def test_worker(self) -> None:
        """Test creation and querying of workers"""
        assert self.db is not None, "No db initialized"
        db: MephistoDB = self.db

        # Check creation and retrieval of a worker
        worker_name = "test_worker"
        provider_type = PROVIDER_TYPE
        worker_id = db.new_worker(worker_name, provider_type)
        self.assertIsNotNone(worker_id)
        self.assertTrue(isinstance(worker_id, str))
        worker_row = db.get_worker(worker_id)
        self.assertEqual(worker_row["worker_name"], worker_name)

        worker = Worker.get(db, worker_id)
        self.assertEqual(worker.worker_name, worker_name)

        # Check finding for workers
        workers = db.find_workers()
        self.assertEqual(len(workers), 1)
        self.assertTrue(isinstance(workers[0], Worker))
        self.assertEqual(workers[0].db_id, worker_id)
        self.assertEqual(workers[0].worker_name, worker_name)

        # Check finding for specific workers
        workers = db.find_workers(worker_name=worker_name)
        self.assertEqual(len(workers), 1)
        self.assertTrue(isinstance(workers[0], Worker))
        self.assertEqual(workers[0].db_id, worker_id)
        self.assertEqual(workers[0].worker_name, worker_name)

        workers = db.find_workers(worker_name="fake_name")
        self.assertEqual(len(workers), 0)
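
The test above runs against a database fixture; a minimal self-contained sketch of the same create-and-query flow might look like the following. The `LocalMephistoDB` import path and the `"mock"` provider type are assumptions (import paths differ across Mephisto versions):

import shutil
import tempfile

from mephisto.abstractions.databases.local_database import LocalMephistoDB
from mephisto.data_model.worker import Worker

# Back the DB with a throwaway directory so the sketch cleans up after itself
data_dir = tempfile.mkdtemp()
db = LocalMephistoDB(database_path=f"{data_dir}/mephisto.db")

# Register a worker, then load it back through the Worker data model
worker_id = db.new_worker("test_worker", "mock")
worker = Worker.get(db, worker_id)
assert worker.worker_name == "test_worker"
assert len(db.find_workers(worker_name="test_worker")) == 1

db.shutdown()
shutil.rmtree(data_dir)
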
Example #3
    def _extract_response_by_index(
        self, unit_details: Dict[str, Any], idx: int
    ) -> Optional[Dict[str, Any]]:
        """
        Extract response data from task data.

        :param unit_details:
            full extracted data from a unit
        :param idx:
            index of the singular evaluation within unit_details to extract

        :return response:
            Formatted worker's response data from the task
        """
        task_data = unit_details['data'][idx]
        response: Dict[str, Any] = {
            'run_id': self.run_id,
            'worker': unit_details['worker_id'],
            'worker_name': Worker.get(
                self.mephisto_db, unit_details['worker_id']
            ).worker_name,
            'time_taken': unit_details['task_end'] - unit_details['task_start'],
            'question': task_data['task_specs']['question'],
            'unit_id': unit_details['unit_id'],
            'task_start': unit_details['task_start'],
        }
        onboarding = task_data['task_specs'].get('is_onboarding', False)
        if 'speakerChoice' not in task_data or task_data['speakerChoice'] == '':
            print('speakerChoice missing or empty in task data!')
            return None
        choice = task_data['speakerChoice']
        if onboarding:
            response['correct'] = choice == task_data['pairing_dict']['correct_answer']
        else:
            response['correct'] = -1

        speakers_to_eval = sorted(task_data["pairing_dict"]["speakers_to_eval"])
        response.update(
            {
                'winner': choice,
                'loser': speakers_to_eval[1 - (speakers_to_eval.index(choice))],
                'eval_choice_0': speakers_to_eval[0],
                'eval_choice_1': speakers_to_eval[1],
                'reason': task_data['textReason'],
                'is_onboarding': onboarding,
                'matchup': '__vs__'.join(speakers_to_eval),
                'pairing_id': task_data['pair_id'],
            }
        )

        # If present, add in which of the possible-reason checkboxes the worker checked
        if len(task_data.get('speakerReasons', {})) > 0:
            response.update(
                {
                    self.checkbox_prefix + reason: checked
                    for reason, checked in task_data['speakerReasons'].items()
                }
            )
        return response
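
For reference, a minimal `unit_details` payload satisfying the accesses above might be shaped as follows; every key is taken from the method body, while the concrete values are hypothetical:

unit_details = {
    'worker_id': 'WORKER_DB_ID',  # hypothetical DB ids
    'unit_id': 'UNIT_DB_ID',
    'task_start': 1600000000.0,
    'task_end': 1600000300.0,
    'data': [
        {
            'task_specs': {
                'question': 'Who would you rather talk to?',
                'is_onboarding': False,
            },
            'speakerChoice': 'model_a',
            'textReason': 'More engaging responses.',
            'pairing_dict': {
                'speakers_to_eval': ['model_a', 'model_b'],
                'correct_answer': 'model_a',  # only consulted for onboarding
            },
            'pair_id': 'PAIR_0',
            'speakerReasons': {},  # optional reason checkboxes
        }
    ],
}
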
Example #4
    def get_workers_with_qualification(
        self, qualification_name: str
    ) -> List[Worker]:
        """
        Return a list of `Worker`s that have been granted the qualification
        named `qualification_name`.
        """
        qual_list = self.db.find_qualifications(qualification_name=qualification_name)
        assert len(qual_list) >= 1, f"No qualification found named {qualification_name}"
        qualification_id = qual_list[0].db_id
        qualifieds = self.db.check_granted_qualifications(
            qualification_id=qualification_id, value=1
        )
        return [Worker.get(self.db, qual.worker_id) for qual in qualifieds]
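
A hedged sketch of exercising this helper end-to-end, reusing only calls that appear elsewhere in these examples; `manager` stands in for whatever object owns `get_workers_with_qualification`, and the qualification name is hypothetical:

# `db` is an open MephistoDB and `manager` owns the method above; both assumed
find_or_create_qualification(db, "my_task_allowlist")
worker = Worker.get(db, db.new_worker("qualified_worker", "mock"))
worker.grant_qualification("my_task_allowlist", 1)

qualified = manager.get_workers_with_qualification("my_task_allowlist")
assert any(w.worker_name == "qualified_worker" for w in qualified)
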
Example #5
    def test_worker(self) -> None:
        """Ensure we can query and use a worker"""
        db: MephistoDB = self.db
        requester = self.get_test_requester()
        WorkerClass = self.CrowdProviderClass.WorkerClass
        test_worker = WorkerClass.new(db, self.get_test_worker_name())
        test_worker_2 = Worker.get(db, test_worker.db_id)
        self.assertEqual(
            test_worker.worker_name,
            test_worker_2.worker_name,
            "Worker gotten from db not same as first init",
        )

        # Ensure blocking is doable
        test_worker.block_worker("Test reason", requester=requester)
        self.assertTrue(test_worker.is_blocked(requester))
        test_worker.unblock_worker("Test reason", requester=requester)
        self.assertFalse(test_worker.is_blocked(requester))
Example #6
    def test_create_and_find_worker(self) -> None:
        """Ensure we can find a worker by MTurk id"""
        db = self.db
        TEST_MTURK_WORKER_ID = "ABCDEFGHIJ"

        test_worker = MTurkWorker.new(db, TEST_MTURK_WORKER_ID)
        test_worker_2 = Worker.get(db, test_worker.db_id)
        self.assertEqual(
            test_worker.worker_name,
            test_worker_2.worker_name,
            "Worker gotten from db not same as first init",
        )

        test_worker_3 = MTurkWorker.get_from_mturk_worker_id(db, TEST_MTURK_WORKER_ID)
        assert test_worker_3 is not None
        self.assertEqual(
            test_worker.worker_name,
            test_worker_3.worker_name,
            "Worker gotten from db not same as first init",
        )

        failed_worker = MTurkWorker.get_from_mturk_worker_id(db, "FAKE_ID")
        self.assertIsNone(failed_worker, f"Found worker {failed_worker} from a fake id")
Example #7
def format_for_printing_data(data):
    global db
    # Custom tasks can define methods for how to display their data in a relevant way
    worker_name = Worker.get(db, data["worker_id"]).worker_name
    contents = data["data"]
    duration = contents["times"]["task_end"] - contents["times"]["task_start"]
    metadata_string = (
        f"Worker: {worker_name}\nUnit: {data['unit_id']}\n"
        f"Duration: {int(duration)}\nStatus: {data['status']}\n"
    )

    inputs = contents["inputs"]
    inputs_string = f"Character: {inputs['character_name']}\nDescription: {inputs['character_description']}\n"

    outputs = contents["outputs"]
    output_string = f"   Rating: {outputs['rating']}\n"
    found_files = outputs.get("files")
    if found_files is not None:
        unit = Unit.get(db, data["unit_id"])
        file_dir = unit.get_assigned_agent().get_data_dir()
        output_string += f"   Files: {found_files}\n"
        output_string += f"   File directory: {file_dir}\n"
    else:
        output_string += "   Files: No files attached\n"
    return f"-------------------\n{metadata_string}{inputs_string}{output_string}"
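
A short sketch of driving this formatter by hand with Mephisto's DataBrowser; the import path and the task name "my-task" are assumptions:

from mephisto.tools.data_browser import DataBrowser

data_browser = DataBrowser(db=db)  # `db` as set up by the surrounding script
for unit in data_browser.get_units_for_task_name("my-task"):
    print(format_for_printing_data(data_browser.get_data_from_unit(unit)))
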
Example #8
    def test_worker_fails(self) -> None:
        """Ensure workers fail to be created or loaded under failure conditions"""
        assert self.db is not None, "No db initialized"
        db: MephistoDB = self.db

        # Can't get a non-existent entry
        with self.assertRaises(EntryDoesNotExistException):
            worker = Worker.get(db, self.get_fake_id("Worker"))

        worker_name = "test_worker"
        provider_type = PROVIDER_TYPE
        worker_id = db.new_worker(worker_name, provider_type)

        # Can't create same worker again
        with self.assertRaises(EntryAlreadyExistsException):
            worker_id = db.new_worker(worker_name, provider_type)

        # Can't use no name
        with self.assertRaises(MephistoDBException):
            worker_id = db.new_worker("", provider_type)

        # Ensure the failed calls above didn't create any additional workers
        workers = db.find_workers()
        self.assertEqual(len(workers), 1)
Example #9
    def make_registered_worker(self, worker_name) -> Worker:
        worker_id = self.db.new_worker(worker_name + "_sandbox", "mock")
        return Worker.get(self.db, worker_id)
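
Note that the helper registers the worker under the supplied name plus a "_sandbox" suffix, so a hypothetical call inside a test would look like:

worker = self.make_registered_worker("annotator_1")
assert worker.worker_name == "annotator_1_sandbox"
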
Example #10
def run_examine_by_worker(
    db: "MephistoDB",
    format_data_for_printing: Callable[[Dict[str, Any]], str],
    task_name: Optional[str] = None,
    block_qualification: Optional[str] = None,
    approve_qualification: Optional[str] = None,
):
    """
    Basic script for reviewing work, grouped by worker for convenience. First gets
    the required information to run a review, then
    """
    data_browser = DataBrowser(db=db)

    # Get initial arguments
    if task_name is None:
        task_name, block_qualification, approve_qualification = prompt_for_options(
            task_name, block_qualification, approve_qualification
        )

    tasks = db.find_tasks(task_name=task_name)
    assert len(tasks) >= 1, f"No task found under name {task_name}"

    print(
        "You will be reviewing actual tasks with this flow. Tasks that you either Accept or Pass "
        "will be paid out to the worker, while rejected tasks will not. Passed tasks will be "
        "specially marked such that you can leave them out of your dataset. \n"
        "You may enter the option in caps to apply it to the rest of the units for a given worker."
    )
    if block_qualification is not None:
        created_block_qual = find_or_create_qualification(db, block_qualification)
        print(
            "When you pass or reject a task, the script gives you the option to "
            "disqualify the worker from future tasks by granting a qualification. "
            "Workers granted this qualification can no longer work on tasks whose "
            f"--block-qualification matches the name you provided above: "
            f"{block_qualification}\n"
        )
    if approve_qualification is not None:
        created_approve_qual = find_or_create_qualification(db, approve_qualification)
        print(
            "You may use this script to establish a qualified worker pool by granting "
            f"the provided approve qualification {approve_qualification} to workers "
            "you think understand the task well. You will be offered this option "
            "whenever you (A)pprove all of a worker's units. Future tasks can then "
            "require this qualification, as described in the common qualification "
            "flows document."
        )
    print(
        "**************\n"
        "You should only reject tasks when it is clear the worker has acted in bad faith, and "
        "didn't actually do the task. Prefer to pass on tasks that were misunderstandings.\n"
        "**************\n"
    )

    units = data_browser.get_units_for_task_name(task_name)

    others = [u for u in units if u.get_status() != "completed"]
    units = [u for u in units if u.get_status() == "completed"]
    reviews_left = len(units)
    previous_work_by_worker = get_worker_stats(others)

    # Determine allowed options
    options = ["a", "p", "r"]
    options_string = "Do you want to accept this work? (a)ccept, (r)eject, (p)ass:"

    units_by_worker: Dict[str, List["Unit"]] = {}

    for u in units:
        w_id = u.worker_id
        if w_id not in units_by_worker:
            units_by_worker[w_id] = []
        units_by_worker[w_id].append(u)

    # Run the review
    for w_id, w_units in units_by_worker.items():
        worker = Worker.get(db, w_id)
        worker_name = worker.worker_name
        apply_all_decision = None
        reason = None
        for idx, unit in enumerate(w_units):
            print(
                f"Reviewing for worker {worker_name}, ({idx+1}/{len(w_units)}), "
                f"Previous {format_worker_stats(w_id, previous_work_by_worker)} "
                f"(total remaining: {reviews_left})"
            )
            reviews_left -= 1
            print(format_data_for_printing(data_browser.get_data_from_unit(unit)))
            if apply_all_decision is not None:
                decision = apply_all_decision
            else:
                decision = input(options_string + " ")
            while decision.lower() not in options:
                decision = input(
                    "Decision must be one of a, p, r. Use CAPS to apply to all remaining for worker: "
                )

            agent = unit.get_assigned_agent()
            assert (
                agent is not None
            ), f"Can't make decision on None agent... issue with {unit}"
            if decision.lower() == "a":
                agent.approve_work()
                if decision == "A" and approve_qualification is not None:
                    should_special_qualify = input(
                        "Do you want to approve qualify this worker? (y)es/(n)o: "
                    )
                    if should_special_qualify.lower() in ["y", "yes"]:
                        worker.grant_qualification(approve_qualification, 1)
            elif decision.lower() == "p":
                agent.soft_reject_work()
                if apply_all_decision is None and block_qualification is not None:
                    should_soft_block = input(
                        "Do you want to soft block this worker? (y)es/(n)o: "
                    )
                    if should_soft_block.lower() in ["y", "yes"]:
                        worker.grant_qualification(block_qualification, 1)
            else:  # decision = 'r'
                if apply_all_decision is None:
                    reason = input("Why are you rejecting this work? ")
                    should_block = input(
                        "Do you want to hard block this worker? (y)es/(n)o: "
                    )
                    if should_block.lower() in ["y", "yes"]:
                        block_reason = input("Why permanently block this worker? ")
                        worker.block_worker(block_reason)
                agent.reject_work(reason)

            if decision.lower() != decision:
                apply_all_decision = decision.lower()
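
A sketch of wiring the reviewer up as a script, reusing `format_for_printing_data` from Example #7; the `LocalMephistoDB` import path and all three names passed in are assumptions:

from mephisto.abstractions.databases.local_database import LocalMephistoDB

db = LocalMephistoDB()
run_examine_by_worker(
    db,
    format_for_printing_data,
    task_name="my-task",                    # hypothetical task name
    block_qualification="my-task-block",    # granted on soft block
    approve_qualification="my-task-allow",  # granted on (A)pprove-all
)
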
Example #11
    def get_named_test_worker(self, worker_name: str) -> Worker:
        """Create a test worker with the given worker name"""
        worker_id = self.db.new_worker(worker_name, "mock")
        return Worker.get(self.db, worker_id)
Example #12
    def compile_results(self) -> pd.DataFrame:
        # Load task data
        logging.info('Retrieving task data from Mephisto.')
        task_units_data = self.get_task_data()
        logging.info(f'Data for {len(task_units_data)} units loaded successfully.')

        num_convos_with_no_save_data = 0
        num_wrong_status_convos = 0
        num_complete_convos = 0

        unacceptable_task_units = []
        unacceptable_worker_ids = []
        conversation_idx = 0
        conversation_dfs = []

        for task_unit in task_units_data:
            worker_id = task_unit['worker_id']
            assignment_id = task_unit['assignment_id']

            # Skip this conversation if save data is not found or the status is
            # invalid
            if task_unit['data']['save_data'] is None:
                logging.info('Found a task unit with no save data! Skipping.')
                num_convos_with_no_save_data += 1
                continue
            elif task_unit['status'] not in ['completed', 'approved']:
                logging.info(
                    f'Found a HIT with the status "{task_unit["status"]}"! Skipping.'
                )
                num_wrong_status_convos += 1
                continue
            else:
                num_complete_convos += 1

            # Extract out useful conversation-level data
            custom_data = task_unit['data']['save_data']['custom_data']
            mturk_worker_id = Worker.get(
                self.get_mephisto_db(), worker_id
            ).worker_name
            task_start = datetime.utcfromtimestamp(task_unit['task_start'])
            task_end = datetime.utcfromtimestamp(task_unit['task_end'])
            info_dict = {
                ('worker_id', ''): worker_id,
                ('mturk_worker_id', ''): mturk_worker_id,
                ('unit_id', ''): task_unit['unit_id'],
                ('assignment_id', ''): assignment_id,
                ('conversation_idx', ''): conversation_idx,
                ('date', ''): task_start.strftime('%Y-%m-%d'),
                ('completion_time', ''): (task_end - task_start).total_seconds(),
            }

            # Check that the conversation consists of pairs of comments between
            # Speaker 1 and Speaker 2, with Speaker 1 speaking first
            assert 'final_rating' in task_unit['data']['messages'][-1]['task_data']
            # The final message is just a final rating
            convo_messages = task_unit['data']['messages'][:-1]
            # Messages must alternate, with Speaker 1 taking the even turns
            assert all(
                message['id'] == ('Speaker 2' if message_idx % 2 else 'Speaker 1')
                for message_idx, message in enumerate(convo_messages)
            )
            messages_1 = [m for m in convo_messages if m['id'] == 'Speaker 1']
            messages_2 = [m for m in convo_messages if m['id'] == 'Speaker 2']
            assert len(messages_1) + len(messages_2) == len(convo_messages)

            # Determine whether the HIT contains unacceptable messages. (We do this for
            # every HIT, even if acceptability violation info was already saved, because
            # the violation criteria may have changed since the HIT was collected.)
            utterances_1 = [m['text'] for m in messages_1]
            assert utterances_1[0] == 'Hi!', (
                'This script assumes that the first human message is "Hi!", which is '
                'set by default and cannot be changed by the crowdsourcing worker.'
            )
            acceptability_violations = self.acceptability_checker.check_messages(
                messages=utterances_1[1:],  # Don't use the initial "Hi!"
                is_worker_0=True,
                violation_types=self.acceptability_checker.ALL_VIOLATION_TYPES,
            )
            # Here, "worker 0" refers to Speaker 1, because we mix 0- and 1-indexing
            if acceptability_violations != '':
                logging.info(
                    f'Conversation fails acceptability checks with a violation of '
                    f'"{acceptability_violations}", given the following utterances: '
                    f'{utterances_1[1:]}. Skipping.')
                unacceptable_task_units.append(task_unit)
                assert (
                    mturk_worker_id is not None
                ), "MTurk worker ID cannot be determined for this unacceptable conversation!"
                unacceptable_worker_ids.append(mturk_worker_id)
                continue

            # Ignore the conversation if ratings for all turns are the same, because
            # it's somewhat implausible that *all* turns in a conversation should garner
            # the same rating of engagingness, humanness, interestingness, or none.
            # (However, don't put these workers on the "unacceptable worker IDs" list,
            # to give them a little bit of the benefit of the doubt: i.e. maybe the
            # worker just didn't try hard enough to find which responses were more
            # engaging, etc. than others, but that doesn't mean that all of their HITs
            # across all evals are bad and should be removed.)
            if self.filter_uniform_hits:
                annotations = [
                    m['task_data']['problem_data_for_prior_message']
                    for m in task_unit['data']['messages']
                    if 'problem_data_for_prior_message' in m.get(
                        'task_data', {})
                ]
                hashable_annotations = [
                    tuple(a[key] for key in sorted(a.keys()))
                    for a in annotations
                ]
                unique_annotations = set(hashable_annotations)
                if len(unique_annotations) < 1:
                    raise ValueError('No annotations found for this HIT!')
                elif len(unique_annotations) == 1:
                    logging.info(
                        f'All model responses in the conversation received the same '
                        f'annotation: {hashable_annotations[0]}. Skipping.')
                    unacceptable_task_units.append(task_unit)
                    continue

            single_turn_dicts = []

            # Compile personas and previous utterances
            text_parts = []
            personas = custom_data['personas']
            if personas is not None and len(personas) > 0:
                assert len(personas) == 2
                text_parts += [
                    'HUMAN PERSONA: ' + ' '.join(personas[0]),
                    'BOT PERSONA: ' + ' '.join(personas[1]),
                ]
            additional_context = custom_data['additional_context']
            if additional_context is not None and len(additional_context) > 0:
                text_parts.append('ADDITIONAL CONTEXT: ' + additional_context)
            single_turn_dicts.append(
                {**info_dict, ('context', ''): ' '.join(text_parts)}
            )

            # Loop over conversation turns
            turns_per_speaker = defaultdict(int)
            for message in task_unit['data']['messages']:
                if 'text' in message:
                    speaker_id = message['id']

                    # Add in annotation results, if they exist
                    if 'problem_data_for_prior_message' in message.get('task_data', {}):
                        problem_data = message['task_data'][
                            'problem_data_for_prior_message'
                        ]
                        bucket_data = {
                            ('annotation_bucket', bucket): value
                            for bucket, value in problem_data.items()
                        }
                    else:
                        bucket_data = {}

                    # Add in results from the final rating(s), if they exist
                    if 'final_rating' in message.get('task_data', {}):
                        ratings = message['task_data']['final_rating'].split('|')
                        final_rating_data = {
                            ('final_rating', str(idx)): value
                            for idx, value in enumerate(ratings)
                        }
                    else:
                        final_rating_data = {}

                    turns_per_speaker[speaker_id] += 1

                    single_turn_dicts.append({
                        **info_dict,
                        ('speaker_id', ''): speaker_id,
                        ('speaker_turn_idx', ''): turns_per_speaker[speaker_id],
                        ('text', ''): message['text'].replace('\n', '__newline__'),
                        **bucket_data,
                        **final_rating_data,
                    })

            # Adding the full conversation to the list of conversations
            single_turn_series = [
                pd.Series(dict_).to_frame().transpose()
                for dict_ in single_turn_dicts
            ]
            single_convo_df = pd.concat(single_turn_series, axis=0, sort=False)
            conversation_dfs.append(single_convo_df)
            conversation_idx += 1

        logging.info(
            f'{num_convos_with_no_save_data:d} conversations found with no save data.'
        )
        logging.info(
            f'{num_wrong_status_convos:d} conversations found with the wrong status.'
        )
        logging.info(f'{num_complete_convos:d} complete conversations found:')
        logging.info(
            f'\t{len(unacceptable_task_units):d} unacceptable conversations.')
        logging.info(f'\t{len(conversation_dfs):d} acceptable conversations.')

        # # Compile results across all conversations

        if len(conversation_dfs) == 0:
            raise ValueError('No acceptable conversations found!')
        unordered_conversation_df = pd.concat(conversation_dfs, axis=0)
        initial_ordered_columns = list(info_dict.keys()) + [
            ('context', ''),
            ('speaker_id', ''),
            ('speaker_turn_idx', ''),
            ('text', ''),
        ]
        all_ordered_columns = initial_ordered_columns + [
            col for col in unordered_conversation_df.columns
            if col not in initial_ordered_columns
        ]
        conversation_df = unordered_conversation_df[all_ordered_columns]
        # TODO: is there a less hacky way than this, which relies on the most recent
        #  value of `info_dict`, to put the columns back into the right order?

        # # Calculate and save auxiliary stats

        logging.info(
            f'Saving MTurk IDs of workers with unacceptable conversations to '
            f'{self.unacceptable_worker_ids_path}.')
        with open(self.unacceptable_worker_ids_path, 'w') as f:
            for worker_id in unacceptable_worker_ids:
                f.write(worker_id + '\n')

        # Calculate rates of selecting various annotation buckets
        annotation_bucket_df = conversation_df['annotation_bucket'].dropna(
            axis=0, how='any')
        if annotation_bucket_df.isna().sum().sum() > 0:
            raise ValueError(
                'There is at least one row in which only partial annotation bucket data exists!'
            )
        annotation_selection_rate_df = annotation_bucket_df.mean().to_frame(
            'selection_rate')
        annotation_selection_rate_df.to_csv(
            self.annotation_selection_rate_path)
        logging.info(
            f'Annotation bucket selection rates saved to {self.annotation_selection_rate_path}.'
        )
        output_strings = [
            f'{series.name}: {100*series["selection_rate"]:0.0f}%'
            for _, series in annotation_selection_rate_df.iterrows()
        ]
        logging.info('Annotation bucket selection rates:\n' +
                     '\n'.join(output_strings))

        # Calculate Likert score stats
        final_rating_df = conversation_df['final_rating'].dropna(axis=0, how='any')
        if final_rating_df.isna().sum().sum() > 0:
            raise ValueError(
                'There is at least one row in which only partial final rating data exists!'
            )
        likert_score_stat_df = final_rating_df.astype(int).describe()
        likert_score_stat_df.to_csv(self.likert_score_stat_path)
        logging.info(
            f'Likert score statistics saved to {self.likert_score_stat_path}.')
        logging.info(
            f'Mean Likert scores:\n{likert_score_stat_df.loc["mean"]}')

        return conversation_df
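
Finally, a sketch of how a concrete compiler might be driven; `MyResultsCompiler` is a hypothetical subclass that supplies `get_task_data()` plus the configured paths and attributes referenced above:

compiler = MyResultsCompiler(opt)  # hypothetical subclass and options
conversation_df = compiler.compile_results()
conversation_df.to_csv('results.csv', index=False)  # hypothetical output path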