# NOTE(review): the statements down to `return pair` are the tail of a
# function whose `def` line lies outside this view. The source arrived
# collapsed onto one line, so the indentation below is reconstructed and
# should be confirmed against the full file.
    # Parse the pair's HTML description so its anchors can be inspected.
    parsed_description = BeautifulSoup(pair['description'], 'html.parser')
    references = get_valid_reference_anchors(parsed_description)
    # Materialize the {sourceUrl, title} records *before* the anchors are
    # rewritten below, because mark_as_recorded blanks each href.
    external_references = list(map(reference_structure_from_anchor, references))
    for ref in references:
        mark_as_recorded(ref)
    pair['externalReferences'] = [] if len(
        external_references) < 1 else external_references
    # Store the modified parse tree back as the description string.
    pair['description'] = str(parsed_description)
    finalize_descriptions(pair)
    return pair


def finalize_descriptions(pair):
    '''
    Replace pair['description'] in place with the result of passing it
    through pl.clean_html. Returns None.
    '''
    pair['description'] = pl.clean_html(pair['description'])


def reference_structure_from_anchor(reference):
    '''
    Build the record stored for one reference anchor: its 'href'
    attribute under "sourceUrl" and its text content under "title".
    '''
    return {"sourceUrl": reference.get('href'), "title": reference.get_text()}


def mark_as_recorded(anchor):
    '''
    Mutate the anchor in place: set its 'href' to the empty string and
    change its name to 'span'. Returns None.
    '''
    anchor['href'] = ''
    anchor.name = 'span'


if __name__ == '__main__':
    # Script entry point: read the pairs from the JSON file named by the
    # first command-line argument, process them and emit the result.
    # record_references_inside_pairs is defined outside this view.
    module_attr_pairs = pl.read_json_to_dict(sys.argv[1])
    updated_pairs = record_references_inside_pairs(module_attr_pairs)
    pl.write_pretty_json(updated_pairs)
def section_parent_page(sect_div):
    '''
    Return the page-level id for the section that contains `sect_div`.

    The id is read from the first 'a' element found under
    sect_div.parent.div.div.div. It is split on '.', and everything
    before the first component equal to '1' is re-joined with '.'.
    When no component equals '1', the id is returned unchanged.
    '''
    anchor = sect_div.parent.div.div.div.find('a')
    parent_section_id = anchor.get('id')
    id_parts = parent_section_id.split('.')
    if '1' not in id_parts:
        return parent_section_id
    return '.'.join(id_parts[:id_parts.index('1')])


def get_refs_from_pairs(pairs):
    '''
    Collect the distinct locations (see get_location_from_ref) of every
    entry in each pair's 'externalReferences' list, as a set of tuples.
    '''
    return {
        get_location_from_ref(external_ref)
        for pair in pairs
        for external_ref in pair['externalReferences']
    }


def get_location_from_ref(ref):
    '''
    Return the short HTML location of ref['sourceUrl'], as produced by
    pl.get_short_html_location, split on '/' into a tuple.
    '''
    short_location = pl.get_short_html_location(ref['sourceUrl'])
    return tuple(short_location.split('/'))


if __name__ == '__main__':
    # argv[1]: module-attribute pairs JSON; argv[2]: section listing JSON.
    # find_reference_html_in_sections is defined outside this view.
    module_attr_pairs = pl.read_json_to_dict(sys.argv[1])
    section_listing = pl.read_json_to_dict(sys.argv[2])
    references = find_reference_html_in_sections(module_attr_pairs,
                                                 section_listing)
    pl.write_pretty_json(references)
def module_attribute_relationship(make_standard):
    '''
    Return the result of pl.read_json_to_dict for
    'standard/module_to_attributes.json'.

    `make_standard` is accepted but never read in the body.
    NOTE(review): presumably it exists only so that a prerequisite step
    runs first -- confirm against the caller.
    '''
    json_path = 'standard/module_to_attributes.json'
    return pl.read_json_to_dict(json_path)
# NOTE(review): the loop and `return entries` below are the tail of a
# function whose `def` line (and the code that creates `entries` and
# binds `module`) lies outside this view. The source arrived collapsed
# onto one line, so the indentation is reconstructed and should be
# confirmed against the full file.
        # Emit one flat entry per attribute of the current module.
        for attribute in module['attributes']:
            entries.append({
                'module': module['id'],
                'path': attribute['id'],
                'tag': attribute['tag'],
                'type': attribute['type'],
                'linkToStandard': get_standard_link(module, attribute),
                'description': attribute['description']
            })
    return entries


def get_standard_link(module, attribute):
    '''
    Return the attribute's own 'linkToStandard' when it has one,
    otherwise fall back to the module's 'linkToStandard'.
    '''
    if 'linkToStandard' not in attribute.keys():
        return module['linkToStandard']
    else:
        return attribute['linkToStandard']


if __name__ == "__main__":
    # Script entry point: read the JSON file named by the first
    # command-line argument, build the relationship table and emit it.
    # module_attr_relationship_table is defined outside this view.
    module_attr_list = pl.read_json_to_dict(sys.argv[1])
    module_attr_relationship_list = module_attr_relationship_table(
        module_attr_list)
    pl.write_pretty_json(module_attr_relationship_list)
def attributes(make_standard):
    '''
    Return the result of pl.read_json_to_dict for
    'standard/attributes.json'.

    `make_standard` is accepted but never read in the body.
    NOTE(review): presumably it exists only so that a prerequisite step
    runs first -- confirm against the caller.
    '''
    json_path = 'standard/attributes.json'
    return pl.read_json_to_dict(json_path)
def ciod_module_relationship(make_standard):
    '''
    Return the result of pl.read_json_to_dict for
    'standard/ciod_to_modules.json'.

    `make_standard` is accepted but never read in the body.
    NOTE(review): presumably it exists only so that a prerequisite step
    runs first -- confirm against the caller.
    '''
    json_path = 'standard/ciod_to_modules.json'
    return pl.read_json_to_dict(json_path)
def modules(make_standard):
    '''
    Return the result of pl.read_json_to_dict for
    'standard/modules.json'.

    `make_standard` is accepted but never read in the body.
    NOTE(review): presumably it exists only so that a prerequisite step
    runs first -- confirm against the caller.
    '''
    json_path = 'standard/modules.json'
    return pl.read_json_to_dict(json_path)
def ciods(make_standard):
    '''
    Return the result of pl.read_json_to_dict for
    'standard/ciods.json'.

    `make_standard` is accepted but never read in the body.
    NOTE(review): presumably it exists only so that a prerequisite step
    runs first -- confirm against the caller.
    '''
    json_path = 'standard/ciods.json'
    return pl.read_json_to_dict(json_path)
'''
Convert the processed module-attribute JSON data into a
normalized listing of all modules in the DICOM Standard.
'''
import sys

from dicom_standard import parse_lib as pl


def modules_from_tables(tables):
    '''
    Build a dictionary keyed by module id from an iterable of module
    tables.

    Each value carries the table's 'id', 'name' and 'linkToStandard'
    unchanged, plus its 'description' passed through pl.clean_html.
    When two tables share an id, the later one replaces the earlier.
    '''
    return {
        table['id']: {
            'id': table['id'],
            'name': table['name'],
            'description': pl.clean_html(table['description']),
            'linkToStandard': table['linkToStandard'],
        }
        for table in tables
    }


if __name__ == '__main__':
    # Read the module-attribute tables from the JSON file named by the
    # first command-line argument and emit the normalized module listing.
    module_attr_tables = pl.read_json_to_dict(sys.argv[1])
    modules = modules_from_tables(module_attr_tables)
    pl.write_pretty_json(modules)
'''
Takes the extracted CIOD information and processes it to produce a
dictionary of all CIODs in the DICOM Standard.
'''
import sys

from dicom_standard import parse_lib as pl


def ciods_from_extracted_list(ciod_module_list):
    '''
    Build a dictionary keyed by CIOD id from an iterable of CIOD records.

    Each value carries the record's 'id', 'linkToStandard' and 'name'
    unchanged, plus its 'description' passed through pl.clean_html.
    When two records share an id, the later one replaces the earlier.
    '''
    return {
        record['id']: {
            'id': record['id'],
            'description': pl.clean_html(record['description']),
            'linkToStandard': record['linkToStandard'],
            'name': record['name'],
        }
        for record in ciod_module_list
    }


if __name__ == '__main__':
    # Read the extracted CIOD list from the JSON file named by the first
    # command-line argument and emit the resulting CIOD dictionary.
    ciod_module_list = pl.read_json_to_dict(sys.argv[1])
    ciods = ciods_from_extracted_list(ciod_module_list)
    pl.write_pretty_json(ciods)
import sys

from dicom_standard import parse_lib as pl


def update_sourceurls(module_attr_pairs, references):
    '''
    Rewrite each external reference's 'sourceUrl' to the matching key of
    `references`.

    Two URLs match when the text after their last '#' is equal (for a
    URL without '#', the whole URL is compared). When several keys of
    `references` share a fragment, the first one in iteration order is
    used. References with no match are left untouched.

    The pairs are modified in place and the same list is returned.
    '''
    # Index the reference URLs by fragment once, instead of rescanning and
    # re-splitting every key for every external reference. setdefault
    # keeps the first URL seen for a fragment, so the earliest key wins.
    url_by_fragment = {}
    for source_url in references:
        url_by_fragment.setdefault(source_url.split('#')[-1], source_url)

    for pair in module_attr_pairs:
        for ref in pair['externalReferences']:
            fragment = ref['sourceUrl'].split('#')[-1]
            matched_url = url_by_fragment.get(fragment)
            if matched_url is not None:
                ref['sourceUrl'] = matched_url
    return module_attr_pairs


if __name__ == '__main__':
    # argv[1]: module-attribute pairs JSON; argv[2]: references JSON whose
    # keys are the source URLs to propagate into the pairs.
    module_attr_pairs = pl.read_json_to_dict(sys.argv[1])
    references = pl.read_json_to_dict(sys.argv[2])
    updated_pairs = update_sourceurls(module_attr_pairs, references)
    pl.write_pretty_json(updated_pairs)