Example #1
def extractquestions(xml_dir, output_path):
    """Extract questions from XML_DIR and write to OUTPUT_PATH.

    Extract all unique subject-question-answer triples from a batch of
    20 Questions HITs. XML_DIR should be the XML directory of one of
    the 20 Questions HIT batches, extracted with AMTI. OUTPUT_PATH is
    the location to which the data will be written.
    """
    # submissions : the form data submitted from the twentyquestions
    #   HITs as a list of dictionaries mapping the question identifiers
    #   to the free text, i.e.:
    #
    #     [{'gameRoomJson': game_room_json_string}, ...]
    #
    submissions = _utils.extract_xml_dir(xml_dir)

    # extract the rows from the game room jsons
    row_strs = set()
    for submission in submissions:
        data = json.loads(submission['gameRoomJson'])

        # generate all the subject-question-answer triples created
        # during the game.
        subject = data['game']['round']['subject']
        for questionAndAnswer in data['game']['round']['questionAndAnswers']:
            # use an OrderedDict so the keys appear in the right order
            # in the JSON.
            row = collections.OrderedDict([
                ('subject', subject),
                ('question', questionAndAnswer['question']['questionText']),
                ('answer', questionAndAnswer['answer']['answerValue'])
            ])
            row_strs.add(json.dumps(row))

    # write out the data
    with click.open_file(output_path, 'w') as output_file:
        output_file.write('\n'.join(sorted(row_strs)))
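
A minimal, self-contained sketch of the triple extraction above, run on a hand-made gameRoomJson (the sample data is invented for illustration, not taken from a real HIT):

import collections
import json

sample = {
    'game': {'round': {
        'subject': 'banana',
        'questionAndAnswers': [
            {'question': {'questionText': 'is it yellow?'},
             'answer': {'answerValue': 'always'}},
        ],
    }},
}

subject = sample['game']['round']['subject']
for question_and_answer in sample['game']['round']['questionAndAnswers']:
    # an OrderedDict keeps the keys in a stable order in the JSON
    row = collections.OrderedDict([
        ('subject', subject),
        ('question', question_and_answer['question']['questionText']),
        ('answer', question_and_answer['answer']['answerValue']),
    ])
    print(json.dumps(row))
    # {"subject": "banana", "question": "is it yellow?", "answer": "always"}
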
Example #2
def extractgames(xml_dir, output_path):
    """Extract games from XML_DIR and write to OUTPUT_PATH.

    Extract the 20 Questions game data from a batch of 20 Questions
    HITs. XML_DIR should be the XML directory of one of the 20 Questions
    HIT batches, extracted with AMTI. OUTPUT_PATH is the location to
    which the data will be written.
    """
    # submissions : the form data submitted from the twentyquestions
    # HITs as a list of dictionaries mapping the question identifiers to
    # the free text, i.e.:
    #
    #     [{'gameRoomJson': game_room_json_string}, ...]
    #
    submissions = _utils.extract_xml_dir(xml_dir)

    # deduplicate the games because each crowdworker who participates in
    # the game submits a copy of the game data.
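    # Note that the copies only collapse in the set if they are
    # byte-for-byte identical JSON strings.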
    game_jsons = {submission['gameRoomJson'] for submission in submissions}

    # write out the data
    with click.open_file(output_path, 'w') as output_file:
        output_file.write('\n'.join(sorted(game_jsons)))
Example #3
def extractquality(xml_dir, output_path):
    """Extract quality labels from XML_DIR and write to OUTPUT_PATH.

    Extract the quality annotations from a batch of the quality control
    HITs. XML_DIR should be an XML directory extracted with AMTI.
    OUTPUT_PATH is the location to which the data will be written in
    JSON Lines format. High quality questions are marked with a
    "high_quality" attribute set to True, where high quality means 2 of
    the 3 workers rated the question 'good'. Note that this script
    assumes all 3 assignments were completed for every HIT in the
    batch; if any HIT has fewer than 3 completed assignments, the
    script will throw an error.
    """
    # submissions : the form data submitted from the quality control
    # HITs as a list of dictionaries mapping the question identifiers to
    # the free text, i.e.:
    #
    #     [
    #       {
    #         'attribute-idx': attribute_value,
    #         ...
    #       },
    #       ...
    #     ]
    #
    # See the data for individual attributes and values. The index (idx)
    # is used because each HIT had the worker label multiple instances
    # for efficiency purposes.
    submissions = _utils.extract_xml_dir(xml_dir)

    # decode the data from the ``"attribute-idx": value`` style to the
    # individual rows.
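    #
    # for example (the attribute names here are illustrative), a
    # submission like
    #
    #     {'subject-0': 'cat', 'quality-0': 'good',
    #      'subject-1': 'dog', 'quality-1': 'bad'}
    #
    # would decode into two rows:
    #
    #     {'subject': 'cat', 'quality': 'good'}
    #     {'subject': 'dog', 'quality': 'bad'}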
    rows = _utils.decode_attribute_idx_data(submissions)

    # aggregate all the quality labels for each instance, since we had
    # multiple assignments / workers per instance.
    key_to_qualities = collections.defaultdict(list)
    for row in rows:
        key = _utils.key(row, KEY_SCHEMA.keys())
        key_to_qualities[key].append(row['quality'])

    # create the new rows by processing the aggregated quality labels
    new_row_strs = []
    for key, qualities in key_to_qualities.items():
        assert len(qualities) == EXPECTED_NUM_QUALITIES, (
            f'{key} only has {len(qualities)} quality labels.'
            f' It should have exactly {EXPECTED_NUM_QUALITIES}.')

        # create the new row

        # use an OrderedDict so that the keys appear in the right order
        # in the JSON.
        new_row = collections.OrderedDict([
            (attribute, as_type(value))
            for (attribute, as_type), value in zip(KEY_SCHEMA.items(), key)
        ])

        # compute new attributes to add
        score = sum([QUALITY_TO_BIT[quality] for quality in qualities])
        high_quality = score >= MIN_SCORE

        # add the new attributes
        new_row['quality_labels'] = qualities
        new_row['score'] = score
        new_row['high_quality'] = high_quality

        new_row_strs.append(json.dumps(new_row))

    # write out the data
    with click.open_file(output_path, 'w') as output_file:
        output_file.write('\n'.join(sorted(new_row_strs)))
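
The function above references several module-level constants that are not shown in this excerpt. A plausible sketch, consistent with how the code uses them and with the docstring's "2 of the 3 workers" rule (the actual definitions live in the source module and may differ):

import collections

EXPECTED_NUM_QUALITIES = 3              # three assignments per HIT
MIN_SCORE = 2                           # 2 of 3 'good' ratings => high quality
QUALITY_TO_BIT = {'good': 1, 'bad': 0}  # assumed label vocabulary
KEY_SCHEMA = collections.OrderedDict([  # attribute name -> type coercion
    ('subject', str),
    ('question', str),
    ('answer', str),
])
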
Example #4
def extracttypes(xml_dir, output_path):
    """Extract commonsense types from XML_DIR and write to OUTPUT_PATH.

    Extract the commonsense types for each subject-question pair from a
    batch of the commonsense type HITs. XML_DIR should be an XML
    directory extracted with AMTI. OUTPUT_PATH is the location to which
    the data will be written in a JSON Lines format. Each instance will
    have a "types" attribute that is a dictionary mapping each type to a
    true or false label. Each instance will also have a "type_scores"
    attribute giving the raw count of votes for each type.
    """
    # submissions : the form data submitted from the commonsense type
    # HITs as a list of dictionaries mapping the question identifiers to
    # the free text, i.e.:
    #
    #     [
    #       {
    #         'attribute-idx': attribute_value,
    #         ...
    #       },
    #       ...
    #     ]
    #
    # See the data for individual attributes and values. The index (idx)
    # is used because each HIT had the worker label multiple instances
    # for efficiency purposes.
    submissions = _utils.extract_xml_dir(xml_dir)

    # decode the data from the ``"attribute-idx": value`` style to the
    # individual rows.
    rows = _utils.decode_attribute_idx_data(submissions)

    # aggregate all the type labels for each instance, since we had
    # multiple assignments / workers per instance.
    key_to_type_scores = collections.defaultdict(
        lambda: {
            'total_votes': 0,
            'ontological': 0,
            'capability': 0,
            'location': 0,
            'physical': 0,
            'non-physical': 0,
            'meronymy': 0,
            'association': 0
        })
    for row in rows:
        key = _utils.key(row, KEY_SCHEMA.keys())
        key_to_type_scores[key]['total_votes'] += 1
        for type_ in ['ontological', 'capability', 'location', 'physical',
                      'non-physical', 'meronymy', 'association']:
            key_to_type_scores[key][type_] += int(row.get(type_, 0))

    # create the new rows by processing the aggregated types
    new_row_strs = []
    for key, type_scores in key_to_type_scores.items():
        total_votes = type_scores.pop('total_votes')
        assert total_votes == EXPECTED_NUM_VOTES, (
            f'{key} only has {total_votes} annotations.'
            f' It should have exactly {EXPECTED_NUM_VOTES}.'
        )

        # create the new row

        # use an OrderedDict so the keys appear in the right order in
        # the JSON.
        new_row = collections.OrderedDict([
            (attribute, as_type(value))
            for (attribute, as_type), value
            in zip(KEY_SCHEMA.items(), key)
        ])

        # compute new attributes to add
        types = {
            type_: score > (EXPECTED_NUM_VOTES / 2.0)
            for type_, score in type_scores.items()
        }

        # add the new attributes
        new_row['types'] = types
        new_row['type_scores'] = type_scores

        new_row_strs.append(json.dumps(new_row))

    # write out the data
    with click.open_file(output_path, 'w') as output_file:
        output_file.write('\n'.join(sorted(new_row_strs)))
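
A small self-contained illustration of the vote-thresholding step above, assuming EXPECTED_NUM_VOTES is 3 (the constant is defined elsewhere in the source module):

EXPECTED_NUM_VOTES = 3  # assumed; not shown in the excerpt above

type_scores = {'ontological': 3, 'capability': 1, 'location': 0}
types = {
    type_: score > (EXPECTED_NUM_VOTES / 2.0)  # strict majority of votes
    for type_, score in type_scores.items()
}
print(types)  # {'ontological': True, 'capability': False, 'location': False}
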
Example #5
def extractlabels(xml_dir, output_path):
    """Extract labeling data from XML_DIR and write to OUTPUT_PATH.

    Extract the subject-question pair labeling data from a batch of the
    question labeling HITs. XML_DIR should be an XML directory extracted
    with AMTI. OUTPUT_PATH is the location to which the data will be
    written in a JSON Lines format. Each instance will have a "labels"
    attribute giving the list of labels, a "majority" attribute giving
    the majority (true / false) vote, a "true_votes" attribute giving
    the number of votes for "true", and an "is_bad" attribute giving
    whether or not any annotator labeled the assertion as "bad".
    """
    # submissions : the form data submitted from the question labeling
    # HITs as a list of dictionaries mapping the question identifiers to
    # the free text, i.e.:
    #
    #     [
    #       {
    #         'attribute-idx': attribute_value,
    #         ...
    #       },
    #       ...
    #     ]
    #
    # See the data for individual attributes and values. The index (idx)
    # is used because each HIT had the worker label multiple instances
    # for efficiency purposes.
    submissions = _utils.extract_xml_dir(xml_dir)

    # decode the data from the ``"attribute-idx": value`` style to the
    # individual rows.
    rows = _utils.decode_attribute_idx_data(submissions)

    # aggregate all the labels for each instance, since we had multiple
    # assignments / workers per instance.
    key_to_labels = collections.defaultdict(list)
    for row in rows:
        key = _utils.key(row, KEY_SCHEMA.keys())
        key_to_labels[key].append(row['label'])

    # create the new rows by processing the aggregated labels
    new_row_strs = []
    for key, labels in key_to_labels.items():
        assert len(labels) == EXPECTED_NUM_LABELS, (
            f'{key} only has {len(labels)} assertion labels.'
            f' It should have exactly {EXPECTED_NUM_LABELS}.'
        )

        # create the new row

        # use an OrderedDict so the keys appear in the right order in
        # the JSON.
        new_row = collections.OrderedDict([
            (attribute, as_type(value))
            for (attribute, as_type), value
            in zip(KEY_SCHEMA.items(), key)
        ])

        # compute new attributes to add
        is_bad = 'bad' in labels
        true_votes = sum([LABEL_TO_BIT[label] for label in labels])
        majority = true_votes > (len(labels) / 2.0)

        # add the new attributes
        new_row['labels'] = labels
        new_row['is_bad'] = is_bad
        new_row['true_votes'] = true_votes
        new_row['majority'] = majority

        new_row_strs.append(json.dumps(new_row))

    # write out the data
    with click.open_file(output_path, 'w') as output_file:
        output_file.write('\n'.join(sorted(new_row_strs)))
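
The vote counting above depends on a LABEL_TO_BIT table that is not shown in this excerpt. A sketch consistent with the docstring ("true" votes are counted, and "bad" is a possible label), though the real mapping may differ:

LABEL_TO_BIT = {'true': 1, 'false': 0, 'bad': 0}  # assumed vocabulary

labels = ['true', 'true', 'bad']
is_bad = 'bad' in labels                                   # True
true_votes = sum(LABEL_TO_BIT[label] for label in labels)  # 2
majority = true_votes > (len(labels) / 2.0)                # True
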
Example #6
def extractmirrorsubjects(xml_dir, output_path):
    """Extract mirror subjects from XML_DIR and write to OUTPUT_PATH.

    Extract mirror subject data from a batch of the mirror subjects
    HITs. XML_DIR should be an XML directory extracted with AMTI.
    OUTPUT_PATH is the location to which the data will be written in
    JSON Lines format.
    """
    # submissions : the form data submitted from the
    # mirror-subjects HITs as a list of dictionaries mapping the
    # question identifiers to the free text, i.e.:
    #
    #     [
    #       {
    #         'attribute-idx': attribute_value,
    #         ...
    #       },
    #       ...
    #     ]
    #
    # See the data for individual attributes and values. The index (idx)
    # is used because each HIT had the worker label multiple instances
    # for efficiency purposes.
    submissions = _utils.extract_xml_dir(xml_dir)

    # decode the data from the ``"attribute-idx": value`` style to the
    # individual rows.
    rows = _utils.decode_attribute_idx_data(submissions)

    # coerce the data types correctly and add in the new attributes.
    new_subjects_skipped = 0
    new_row_strs = []
    for row in rows:
        # create the new row

        # use an OrderedDict so that the keys appear in the right order
        # in the JSON.
        new_row = collections.OrderedDict([
            (attribute, as_type(row[attribute]))
            for attribute, as_type in KEY_SCHEMA.items()
        ])

        # clean up the raw text of the new subject
        # strip whitespace and lowercase
        new_subject = row['new_subject']\
            .strip()\
            .lower()
        # remove beginning and ending punctuation
        normalized_new_subject = re.sub(r'(^[^a-z0-9]|[^a-z0-9]$)', '',
                                        new_subject)

        # filter out bad examples using a few rules
        unexpected_format = not re.match(r'^[a-z0-9-]+$',
                                         normalized_new_subject)
        too_long = len(normalized_new_subject) > 20
        if unexpected_format or too_long:
            logger.warning(f'Skipping new subject "{normalized_new_subject}".')
            new_subjects_skipped += 1
            continue

        if normalized_new_subject != new_subject:
            logger.warning(f'New subject {new_subject} was modified to'
                           f' {normalized_new_subject}.')

        new_row['subject'] = normalized_new_subject
        new_row['answer'] = None

        # delete the irrelevant label attributes since they don't apply
        # to the new subject
        del new_row['labels']
        del new_row['is_bad']
        del new_row['true_votes']
        del new_row['majority']

        new_row_strs.append(json.dumps(new_row))

    if new_subjects_skipped > 0:
        logger.warning(f'{new_subjects_skipped} new subjects were skipped.')

    # write out the data
    with click.open_file(output_path, 'w') as output_file:
        output_file.write('\n'.join(sorted(new_row_strs)))
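
The normalization and filtering rules above are easiest to see on concrete strings; a quick illustration with invented inputs:

import re

for raw in [' Banana. ', '"T-Rex"', 'polar bear']:
    new_subject = raw.strip().lower()
    # strip one leading and one trailing non-alphanumeric character
    normalized = re.sub(r'(^[^a-z0-9]|[^a-z0-9]$)', '', new_subject)
    # the same format and length checks as in the function above
    keep = (re.match(r'^[a-z0-9-]+$', normalized) is not None
            and len(normalized) <= 20)
    print(repr(normalized), keep)

# 'banana' True
# 't-rex' True
# 'polar bear' False  (spaces fail the format check, so it is skipped)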