def extractquestions(xml_dir, output_path):
    """Extract questions from XML_DIR and write to OUTPUT_PATH.

    Extract all unique subject-question-answer triples from a batch of
    20 Questions HITs. XML_DIR should be the XML directory of one of
    the 20 Questions HIT batches, extracted with AMTI. OUTPUT_PATH is
    the location to which the data will be written.
    """
    # submissions : the form data submitted from the twentyquestions
    # HITs as a list of dictionaries mapping the question identifiers
    # to the free text, i.e.:
    #
    #     [{'gameRoomJson': game_room_json_string}, ...]
    #
    submissions = _utils.extract_xml_dir(xml_dir)

    # extract the rows from the game room jsons
    row_strs = set()
    for submission in submissions:
        data = json.loads(submission['gameRoomJson'])

        # generate all the subject-question-answer triples created
        # during the game.
        subject = data['game']['round']['subject']
        for questionAndAnswer in data['game']['round']['questionAndAnswers']:
            # use an OrderedDict so the keys appear in the right order
            # in the JSON.
            row = collections.OrderedDict([
                ('subject', subject),
                ('question', questionAndAnswer['question']['questionText']),
                ('answer', questionAndAnswer['answer']['answerValue'])
            ])
            row_strs.add(json.dumps(row))

    # write out the data
    with click.open_file(output_path, 'w') as output_file:
        output_file.write('\n'.join(sorted(row_strs)))
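
# Hedged illustration (not repository code): a minimal gameRoomJson
# payload in the shape extractquestions expects, with the same
# triple-extraction logic applied to it. All field values below are
# hypothetical, and the sketch relies on the module-level
# ``collections`` and ``json`` imports used above.
def _example_extract_triples():
    fake_game_room = {
        'game': {
            'round': {
                'subject': 'guitar',
                'questionAndAnswers': [
                    {
                        'question': {'questionText': 'Is it a musical instrument?'},
                        'answer': {'answerValue': 'always'},
                    },
                ],
            },
        },
    }
    subject = fake_game_room['game']['round']['subject']
    for question_and_answer in fake_game_room['game']['round']['questionAndAnswers']:
        row = collections.OrderedDict([
            ('subject', subject),
            ('question', question_and_answer['question']['questionText']),
            ('answer', question_and_answer['answer']['answerValue']),
        ])
        print(json.dumps(row))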
def extractgames(xml_dir, output_path):
    """Extract games from XML_DIR and write to OUTPUT_PATH.

    Extract the 20 Questions game data from a batch of 20 Questions
    HITs. XML_DIR should be the XML directory of one of the 20
    Questions HIT batches, extracted with AMTI. OUTPUT_PATH is the
    location to which the data will be written.
    """
    # submissions : the form data submitted from the twentyquestions
    # HITs as a list of dictionaries mapping the question identifiers
    # to the free text, i.e.:
    #
    #     [{'gameRoomJson': game_room_json_string}, ...]
    #
    submissions = _utils.extract_xml_dir(xml_dir)

    # deduplicate the games because each crowdworker who participates
    # in the game submits a copy of the game data.
    game_jsons = set(
        submission['gameRoomJson'] for submission in submissions)

    # write out the data
    with click.open_file(output_path, 'w') as output_file:
        output_file.write('\n'.join(game_jsons))
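
# Hedged usage sketch (assumption): reading back the JSON Lines file
# that extractgames writes. Each line is one deduplicated game room's
# JSON blob. Uses the same ``json`` and ``click`` imports the functions
# above rely on; the helper name is hypothetical.
def _example_read_games(games_path):
    """Yield each deduplicated game as a parsed dictionary."""
    with click.open_file(games_path, 'r') as games_file:
        for line in games_file:
            line = line.strip()
            if line:
                yield json.loads(line)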
def extractquality(xml_dir, output_path):
    """Extract quality labels from XML_DIR and write to OUTPUT_PATH.

    Extract the quality annotations from a batch of the quality control
    HITs. XML_DIR should be an XML directory extracted with AMTI.
    OUTPUT_PATH is the location to which the data will be written in
    JSON Lines format.

    High quality questions are marked with a "high_quality" attribute
    set to True, where "high quality" means at least 2 of the 3 workers
    rated the question 'good'. Note that this script assumes every HIT
    in the batch had all 3 assignments completed; if any HIT has fewer
    than 3 completed assignments, the script will raise an error.
    """
    # submissions : the form data submitted from the quality control
    # HITs as a list of dictionaries mapping the question identifiers
    # to the free text, i.e.:
    #
    #     [
    #         {
    #             'attribute-idx': attribute_value,
    #             ...
    #         },
    #         ...
    #     ]
    #
    # See the data for individual attributes and values. The index
    # (idx) is used because each HIT had the worker label multiple
    # instances for efficiency purposes.
    submissions = _utils.extract_xml_dir(xml_dir)

    # decode the data from the ``"attribute-idx": value`` style to the
    # individual rows.
    rows = _utils.decode_attribute_idx_data(submissions)

    # aggregate all the quality labels for each instance, since we had
    # multiple assignments / workers per instance.
    key_to_qualities = collections.defaultdict(list)
    for row in rows:
        key = _utils.key(row, KEY_SCHEMA.keys())
        key_to_qualities[key].append(row['quality'])

    # create the new rows by processing the aggregated quality labels
    new_row_strs = []
    for key, qualities in key_to_qualities.items():
        assert len(qualities) == EXPECTED_NUM_QUALITIES, (
            f'{key} only has {len(qualities)} quality labels.'
            f' It should have exactly {EXPECTED_NUM_QUALITIES}.')

        # create the new row
        # use an OrderedDict so that the keys appear in the right order
        # in the JSON.
        new_row = collections.OrderedDict([
            (attribute, as_type(value))
            for (attribute, as_type), value
            in zip(KEY_SCHEMA.items(), key)
        ])

        # compute new attributes to add
        score = sum([QUALITY_TO_BIT[quality] for quality in qualities])
        high_quality = score >= MIN_SCORE

        # add the new attributes
        new_row['quality_labels'] = qualities
        new_row['score'] = score
        new_row['high_quality'] = high_quality

        new_row_strs.append(json.dumps(new_row))

    # write out the data
    with click.open_file(output_path, 'w') as output_file:
        output_file.write('\n'.join(sorted(new_row_strs)))
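
# Hedged illustration (assumption): how ``score`` and ``high_quality``
# are derived. The docstring says "high quality" means 2 of the 3
# workers rated the question 'good'; the local constants below encode
# that reading and are assumptions, not the repository's actual
# definitions of EXPECTED_NUM_QUALITIES, QUALITY_TO_BIT, or MIN_SCORE.
def _example_quality_aggregation():
    expected_num_qualities = 3           # three assignments per HIT
    quality_to_bit = {'good': 1, 'bad': 0}
    min_score = 2                        # at least 2 of 3 'good' votes

    qualities = ['good', 'bad', 'good']  # hypothetical worker labels
    assert len(qualities) == expected_num_qualities
    score = sum(quality_to_bit[quality] for quality in qualities)  # -> 2
    high_quality = score >= min_score                              # -> True
    return score, high_quality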
def extracttypes(xml_dir, output_path):
    """Extract commonsense types from XML_DIR and write to OUTPUT_PATH.

    Extract the commonsense types for each subject-question pair from a
    batch of the commonsense type HITs. XML_DIR should be an XML
    directory extracted with AMTI. OUTPUT_PATH is the location to which
    the data will be written in JSON Lines format. Each instance will
    have a "types" attribute that is a dictionary mapping each type to
    a true or false label, and a "type_scores" attribute giving the raw
    count of votes for each type.
    """
    # submissions : the form data submitted from the commonsense type
    # HITs as a list of dictionaries mapping the question identifiers
    # to the free text, i.e.:
    #
    #     [
    #         {
    #             'attribute-idx': attribute_value,
    #             ...
    #         },
    #         ...
    #     ]
    #
    # See the data for individual attributes and values. The index
    # (idx) is used because each HIT had the worker label multiple
    # instances for efficiency purposes.
    submissions = _utils.extract_xml_dir(xml_dir)

    # decode the data from the ``"attribute-idx": value`` style to the
    # individual rows.
    rows = _utils.decode_attribute_idx_data(submissions)

    # aggregate all the type labels for each instance, since we had
    # multiple assignments / workers per instance.
    key_to_type_scores = collections.defaultdict(
        lambda: {
            'total_votes': 0,
            'ontological': 0,
            'capability': 0,
            'location': 0,
            'physical': 0,
            'non-physical': 0,
            'meronymy': 0,
            'association': 0
        })
    for row in rows:
        key = _utils.key(row, KEY_SCHEMA.keys())
        key_to_type_scores[key]['total_votes'] += 1
        key_to_type_scores[key]['ontological'] += int(row.get('ontological', 0))
        key_to_type_scores[key]['capability'] += int(row.get('capability', 0))
        key_to_type_scores[key]['location'] += int(row.get('location', 0))
        key_to_type_scores[key]['physical'] += int(row.get('physical', 0))
        key_to_type_scores[key]['non-physical'] += int(row.get('non-physical', 0))
        key_to_type_scores[key]['meronymy'] += int(row.get('meronymy', 0))
        key_to_type_scores[key]['association'] += int(row.get('association', 0))

    # create the new rows by processing the aggregated types
    new_row_strs = []
    for key, type_scores in key_to_type_scores.items():
        total_votes = type_scores.pop('total_votes')
        assert total_votes == EXPECTED_NUM_VOTES, (
            f'{key} only has {total_votes} annotations.'
            f' It should have exactly {EXPECTED_NUM_VOTES}.')

        # create the new row
        # use an OrderedDict so the keys appear in the right order in
        # the JSON.
        new_row = collections.OrderedDict([
            (attribute, as_type(value))
            for (attribute, as_type), value
            in zip(KEY_SCHEMA.items(), key)
        ])

        # compute new attributes to add
        types = {
            type_: score > (EXPECTED_NUM_VOTES / 2.0)
            for type_, score in type_scores.items()
        }

        # add the new attributes
        new_row['types'] = types
        new_row['type_scores'] = type_scores

        new_row_strs.append(json.dumps(new_row))

    # write out the data
    with click.open_file(output_path, 'w') as output_file:
        output_file.write('\n'.join(sorted(new_row_strs)))
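
# Hedged illustration (assumption): how the boolean "types" dictionary
# is derived from the raw vote counts. The value 3 for the expected
# number of votes and the vote counts below are hypothetical, chosen
# only for this example.
def _example_type_aggregation():
    expected_num_votes = 3
    type_scores = {'ontological': 2, 'capability': 1, 'location': 0}
    types = {
        type_: score > (expected_num_votes / 2.0)
        for type_, score in type_scores.items()
    }
    # types == {'ontological': True, 'capability': False, 'location': False}
    return types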
def extractlabels(xml_dir, output_path):
    """Extract labeling data from XML_DIR and write to OUTPUT_PATH.

    Extract the subject-question pair labeling data from a batch of the
    question labeling HITs. XML_DIR should be an XML directory
    extracted with AMTI. OUTPUT_PATH is the location to which the data
    will be written in JSON Lines format. Each instance will have a
    "labels" attribute listing the individual labels, a "majority"
    attribute giving the majority (true / false) vote, a "true_votes"
    attribute giving the number of votes for "true", and an "is_bad"
    attribute indicating whether any annotator labeled the assertion as
    "bad".
    """
    # submissions : the form data submitted from the question labeling
    # HITs as a list of dictionaries mapping the question identifiers
    # to the free text, i.e.:
    #
    #     [
    #         {
    #             'attribute-idx': attribute_value,
    #             ...
    #         },
    #         ...
    #     ]
    #
    # See the data for individual attributes and values. The index
    # (idx) is used because each HIT had the worker label multiple
    # instances for efficiency purposes.
    submissions = _utils.extract_xml_dir(xml_dir)

    # decode the data from the ``"attribute-idx": value`` style to the
    # individual rows.
    rows = _utils.decode_attribute_idx_data(submissions)

    # aggregate all the labels for each instance, since we had multiple
    # assignments / workers per instance.
    key_to_labels = collections.defaultdict(list)
    for row in rows:
        key = _utils.key(row, KEY_SCHEMA.keys())
        key_to_labels[key].append(row['label'])

    # create the new rows by processing the aggregated labels
    new_row_strs = []
    for key, labels in key_to_labels.items():
        assert len(labels) == EXPECTED_NUM_LABELS, (
            f'{key} only has {len(labels)} assertion labels.'
            f' It should have exactly {EXPECTED_NUM_LABELS}.')

        # create the new row
        # use an OrderedDict so the keys appear in the right order in
        # the JSON.
        new_row = collections.OrderedDict([
            (attribute, as_type(value))
            for (attribute, as_type), value
            in zip(KEY_SCHEMA.items(), key)
        ])

        # compute new attributes to add
        is_bad = 'bad' in labels
        true_votes = sum([LABEL_TO_BIT[label] for label in labels])
        majority = true_votes > (len(labels) / 2.0)

        # add the new attributes
        new_row['labels'] = labels
        new_row['is_bad'] = is_bad
        new_row['true_votes'] = true_votes
        new_row['majority'] = majority

        new_row_strs.append(json.dumps(new_row))

    # write out the data
    with click.open_file(output_path, 'w') as output_file:
        output_file.write('\n'.join(sorted(new_row_strs)))
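
# Hedged illustration (assumption): a plausible LABEL_TO_BIT mapping,
# inferred from the docstring's true / false / bad labels, and the
# derived attributes for one hypothetical aggregated instance. The
# repository's actual mapping may differ.
def _example_label_aggregation():
    label_to_bit = {'true': 1, 'false': 0, 'bad': 0}

    labels = ['true', 'true', 'false']   # hypothetical worker labels
    is_bad = 'bad' in labels                                    # -> False
    true_votes = sum(label_to_bit[label] for label in labels)   # -> 2
    majority = true_votes > (len(labels) / 2.0)                 # -> True
    return is_bad, true_votes, majority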
def extractmirrorsubjects(xml_dir, output_path):
    """Extract mirror subjects from XML_DIR and write to OUTPUT_PATH.

    Extract mirror subject data from a batch of the mirror subjects
    HITs. XML_DIR should be an XML directory extracted with AMTI.
    OUTPUT_PATH is the location to which the data will be written in
    JSON Lines format.
    """
    # submissions : the form data submitted from the mirror-subjects
    # HITs as a list of dictionaries mapping the question identifiers
    # to the free text, i.e.:
    #
    #     [
    #         {
    #             'attribute-idx': attribute_value,
    #             ...
    #         },
    #         ...
    #     ]
    #
    # See the data for individual attributes and values. The index
    # (idx) is used because each HIT had the worker label multiple
    # instances for efficiency purposes.
    submissions = _utils.extract_xml_dir(xml_dir)

    # decode the data from the ``"attribute-idx": value`` style to the
    # individual rows.
    rows = _utils.decode_attribute_idx_data(submissions)

    # coerce the data types correctly and add in the new attribute.
    new_subjects_skipped = 0
    new_row_strs = []
    for row in rows:
        # create the new row
        # use an OrderedDict so that the keys appear in the right order
        # in the JSON.
        new_row = collections.OrderedDict([
            (attribute, as_type(row[attribute]))
            for attribute, as_type in KEY_SCHEMA.items()
        ])

        # clean up the raw text of the new subject

        # strip whitespace and lowercase
        new_subject = row['new_subject'].strip().lower()
        # remove beginning and ending punctuation
        normalized_new_subject = re.sub(
            r'(^[^a-z0-9]|[^a-z0-9]$)', '', new_subject)

        # filter out bad examples using a few rules
        unexpected_format = not re.match(
            r'^[a-z0-9-]+$', normalized_new_subject)
        too_long = len(normalized_new_subject) > 20
        if unexpected_format or too_long:
            logger.warning(
                f'Skipping new subject "{normalized_new_subject}".')
            new_subjects_skipped += 1
            continue

        if normalized_new_subject != new_subject:
            logger.warning(f'New subject {new_subject} was modified to'
                           f' {normalized_new_subject}.')

        new_row['subject'] = normalized_new_subject
        new_row['answer'] = None

        # delete the irrelevant label attributes since they don't apply
        # to the new subject
        del new_row['labels']
        del new_row['is_bad']
        del new_row['true_votes']
        del new_row['majority']

        new_row_strs.append(json.dumps(new_row))

    if new_subjects_skipped > 0:
        logger.warning(f'{new_subjects_skipped} new subjects were skipped.')

    # write out the data
    with click.open_file(output_path, 'w') as output_file:
        output_file.write('\n'.join(sorted(new_row_strs)))
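
# Hedged illustration (not repository code): the same normalization and
# filtering rules packaged as a standalone helper, applied to a couple
# of hypothetical worker-submitted subjects. Uses the same ``re``
# import the function above relies on; the helper name is hypothetical.
def _example_normalize_new_subject(raw_subject):
    """Return the cleaned subject, or None if it would be skipped."""
    new_subject = raw_subject.strip().lower()
    # remove a single leading / trailing punctuation character
    normalized = re.sub(r'(^[^a-z0-9]|[^a-z0-9]$)', '', new_subject)
    unexpected_format = not re.match(r'^[a-z0-9-]+$', normalized)
    too_long = len(normalized) > 20
    if unexpected_format or too_long:
        return None
    return normalized

# _example_normalize_new_subject('  Guitar.')       -> 'guitar'
# _example_normalize_new_subject('a brand new car') -> None (contains spaces)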