def process_volume(anthology_volume):
    """Add DOIs to one volume's frontmatter and papers, rewriting its XML file.

    The collection file ``<script dir>/../data/xml/<collection_id>.xml`` is
    parsed, ``add_doi`` is called for the volume's ``<frontmatter>`` element
    (when present) and for every ``<paper>``, and the re-indented tree is
    written back to the same file.

    Args:
        anthology_volume: Full volume identifier; it is split into collection
            and volume IDs by ``deconstruct_anthology_id``.

    Exits the process with status 1 when the volume is not present in the
    collection file. Reads the module-level ``args.force`` flag and forwards
    it to ``add_doi``.
    """
    collection_id, volume_id, _ = deconstruct_anthology_id(anthology_volume)

    print(f'Attempting to add DOIs for {anthology_volume}', file=sys.stderr)

    # Update XML
    xml_file = os.path.join(
        os.path.dirname(sys.argv[0]), '..', 'data', 'xml', f'{collection_id}.xml'
    )
    tree = ET.parse(xml_file)

    formatter = MarkupFormatter()

    volume = tree.getroot().find(f"./volume[@id='{volume_id}']")
    if volume is None:
        # Report the requested ID: interpolating `volume` here could only
        # ever print "None".
        print(
            f'-> FATAL: volume {anthology_volume} not found in the Anthology',
            file=sys.stderr,
        )
        sys.exit(1)

    volume_booktitle = volume.find("./meta/booktitle")
    volume_title = formatter.as_text(volume_booktitle)
    print(f'-> found existing volume "{volume_title}"', file=sys.stderr)

    # Iterate through the frontmatter and all papers.
    # findall() yields the <frontmatter> element itself (or nothing when the
    # volume has none). Passing the result of find() to chain() would instead
    # iterate over the *children* of <frontmatter>, and fail with a TypeError
    # when the element is missing.
    num_added = 0
    for paper in chain(volume.findall('frontmatter'), volume.findall('paper')):
        added = add_doi(paper, collection_id, volume_id, force=args.force)
        if added:
            num_added += 1
            # Pause for one second after each DOI that was actually added.
            sleep(1)

    indent(tree.getroot())

    tree.write(xml_file, encoding="UTF-8", xml_declaration=True)

    print(
        f'-> added {num_added} DOIs to the XML for collection {collection_id}',
        file=sys.stderr,
    )
def main(volumes):
    """Print a Crossref 4.4.1 DOI deposit batch for the given volumes.

    For every volume a ``<conference>`` element is assembled from the
    volume's ``<meta>`` block (editors, event metadata, proceedings metadata
    and the volume DOI), followed by one ``<conference_paper>`` for each
    ``<paper>``. The finished document is printed to stdout as UTF-8 XML.

    Args:
        volumes: Iterable of full Anthology volume IDs; they are processed in
            sorted order.

    Volumes that cannot be found in their collection file are reported on
    stderr and skipped. The process exits with status 1 when a volume's
    ``<month>`` cannot be parsed or when the volume has no ``<url>``.
    """

    def add_person(contributors, person, role, position):
        # Emit one <person_name>; only <first> and <last> children are copied.
        pn = make_simple_element(
            "person_name",
            parent=contributors,
            attrib={
                "contributor_role": role,
                "sequence": "first" if position == 0 else "additional",
            },
        )
        for name_part in person:
            if name_part.tag == "first":
                make_simple_element("given_name", parent=pn, text=name_part.text)
            elif name_part.tag == "last":
                make_simple_element("surname", text=name_part.text, parent=pn)

    formatter = MarkupFormatter()

    ## Assemble container
    doi_batch = make_simple_element(
        "doi_batch",
        attrib={
            "xmlns": "http://www.crossref.org/schema/4.4.1",
            "{http://www.w3.org/2001/XMLSchema-instance}schemaLocation": "http://www.crossref.org/schema/4.4.1 http://www.crossref.org/schema/deposit/crossref4.4.1.xsd",
            "version": "4.4.1",
        },
        namespaces={"xsi": "http://www.w3.org/2001/XMLSchema-instance"},
    )
    new_volume = etree.ElementTree(doi_batch)

    ## Assemble head
    head = make_simple_element("head", parent=new_volume.getroot())
    # Read the clock once so the batch id and the timestamp always agree.
    batch_time = str(int(time.time()))
    make_simple_element("doi_batch_id", text=batch_time, parent=head)
    make_simple_element("timestamp", text=batch_time, parent=head)
    depositor = make_simple_element("depositor", parent=head)
    make_simple_element("depositor_name", text=DEPOSITOR_NAME, parent=depositor)
    make_simple_element("email_address", text=EMAIL_ADDRESS, parent=depositor)
    make_simple_element("registrant", text=REGISTRANT, parent=head)

    ## Assemble body
    body = make_simple_element("body", parent=new_volume.getroot())

    # NOTE(review): year and months are initialised once, so a volume whose
    # meta block lacks them reuses the values of the previous volume. Kept
    # as-is; confirm whether this carry-over is intended.
    year = ""
    start_month = ""
    end_month = ""

    for full_volume_id in sorted(volumes):
        collection_id, volume_id, _ = deconstruct_anthology_id(full_volume_id)

        collection_file = os.path.join(
            os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{collection_id}.xml"
        )
        tree = etree.parse(collection_file)

        v = tree.getroot().find(f"./volume[@id='{volume_id}']")
        if v is None:
            print(f"* Can't find volume {full_volume_id}", file=sys.stderr)
            continue

        # Reset per-volume metadata so a missing tag can neither raise
        # UnboundLocalError nor silently reuse the previous volume's value
        # (a stale url would produce duplicate DOIs).
        url = booktitle = address = publisher = None

        ## Assemble frontmatter
        c = make_simple_element("conference", parent=body)
        contribs = make_simple_element("contributors", parent=c)
        editor_index = 0

        meta = v.find("./meta")
        # A volume without <meta> yields no tags; the url check below then
        # reports it.
        for tag in meta if meta is not None else ():
            if tag.tag == "year":
                year = tag.text
            elif tag.tag == "month":
                month = tag.text
                try:
                    parts = re.split("[-–]", month)
                    start_month = MONTH_HASH[parts[0]]
                    # A single month serves as both start and end.
                    end_month = MONTH_HASH[parts[1] if len(parts) > 1 else parts[0]]
                except (KeyError, TypeError):
                    # KeyError: unknown month name; TypeError: empty <month/>.
                    print(
                        f"FATAL: can't parse month {month} in {full_volume_id}",
                        file=sys.stderr,
                    )
                    sys.exit(1)
            elif tag.tag == "url":
                url = tag.text
            elif tag.tag == "booktitle":
                booktitle = formatter.as_text(tag)
            elif tag.tag == "address":
                address = tag.text
            elif tag.tag == "publisher":
                publisher = tag.text
            elif tag.tag == "editor":
                add_person(contribs, tag, "chair", editor_index)
                editor_index += 1

        if not url:
            # Every DOI below is derived from the volume url.
            print(
                f"FATAL: no url found in the meta block of {full_volume_id}",
                file=sys.stderr,
            )
            sys.exit(1)

        # Assemble Event Metadata
        em = make_simple_element("event_metadata", parent=c)
        make_simple_element("conference_name", parent=em, text=booktitle)
        make_simple_element("conference_location", parent=em, text=address)
        make_simple_element(
            "conference_date",
            parent=em,
            attrib={
                "start_year": year,
                "end_year": year,
                "start_month": start_month,
                "end_month": end_month,
            },
        )

        # Assemble Proceedings Metadata
        pm = make_simple_element("proceedings_metadata", parent=c, attrib={"language": "en"})
        make_simple_element("proceedings_title", parent=pm, text=booktitle)
        p = make_simple_element("publisher", parent=pm)
        make_simple_element("publisher_name", parent=p, text=publisher)
        make_simple_element("publisher_place", parent=p, text=PUBLISHER_PLACE)
        pd = make_simple_element("publication_date", parent=pm)
        make_simple_element("year", parent=pd, text=year)
        make_simple_element("noisbn", parent=pm, attrib={"reason": "simple_series"})

        # DOI assignation data
        dd = make_simple_element("doi_data", parent=pm)
        make_simple_element("doi", parent=dd, text=DOI_PREFIX + url)
        make_simple_element("resource", parent=dd, text=ANTHOLOGY_URL.format(url))

        for paper in v.findall("./paper"):
            ## Individual Paper Data

            # TODO: this is not future-proof, should use anthology.util library functions
            # NOTE(review): for any other url length aa_id stays empty, so
            # the paper DOI equals the volume DOI.
            aa_id = ""
            if len(url) == 6:
                aa_id = "{:02d}".format(int(paper.attrib["id"]))
            elif len(url) == 5:
                aa_id = "{:03d}".format(int(paper.attrib["id"]))

            cp = make_simple_element("conference_paper", parent=c)

            # contributors
            contribs = make_simple_element("contributors", parent=cp)
            for author_index, author in enumerate(paper.findall("./author")):
                add_person(contribs, author, "author", author_index)

            for title in paper.iter(tag="title"):
                o_titles = make_simple_element("titles", parent=cp)
                make_simple_element("title", parent=o_titles, text=formatter.as_text(title))

            pd = make_simple_element("publication_date", parent=cp)
            o_year = make_simple_element("year", parent=pd)
            o_year.text = year

            for pages in paper.iter(tag="pages"):
                if pages.text is None:
                    # An empty <pages/> carries no range; skip it instead of
                    # crashing in re.split.
                    continue
                o_pages = make_simple_element("pages", parent=cp)
                fp = make_simple_element("first_page", parent=o_pages)
                lp = make_simple_element("last_page", parent=o_pages)
                page_parts = re.split("[-–]", pages.text)
                if len(page_parts) > 1:
                    fp.text = page_parts[0]
                    lp.text = page_parts[1]
                else:
                    # only one page
                    fp.text = pages.text
                    lp.text = pages.text

            # DOI assignation data
            dd = make_simple_element("doi_data", parent=cp)
            make_simple_element("doi", parent=dd, text=DOI_PREFIX + url + aa_id)
            make_simple_element("resource", parent=dd, text=ANTHOLOGY_URL.format(url + aa_id))

    print(
        etree.tostring(
            new_volume,
            pretty_print=True,
            encoding="UTF-8",
            xml_declaration=True,
            with_tail=True,
        ).decode("utf-8")
    )
def main(volumes):
    """Print a Crossref 4.4.1 DOI deposit batch for the given volumes.

    For every volume a ``<conference>`` element is assembled from the
    volume's ``<meta>`` block (editors, event metadata, proceedings metadata
    and the volume DOI), followed by one ``<conference_paper>`` for each
    ``<paper>``. The finished document is printed to stdout as UTF-8 XML.

    Args:
        volumes: Iterable of full Anthology volume IDs; they are processed in
            sorted order.

    Volumes that cannot be found in their collection file are reported on
    stderr and skipped. The process exits with status 1 when a volume's
    ``<month>`` cannot be parsed or when the volume has no ``<url>``.
    """

    def add_person(contributors, person, role, position):
        # Emit one <person_name>; only <first> and <last> children are copied.
        pn = make_simple_element(
            'person_name',
            parent=contributors,
            attrib={
                'contributor_role': role,
                'sequence': 'first' if position == 0 else 'additional'
            })
        for name_part in person:
            if name_part.tag == 'first':
                make_simple_element('given_name', parent=pn, text=name_part.text)
            elif name_part.tag == 'last':
                make_simple_element('surname', text=name_part.text, parent=pn)

    formatter = MarkupFormatter()

    ## Assemble container
    doi_batch = make_simple_element(
        'doi_batch',
        attrib={
            'xmlns': 'http://www.crossref.org/schema/4.4.1',
            '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'http://www.crossref.org/schema/4.4.1 http://www.crossref.org/schema/deposit/crossref4.4.1.xsd',
            'version': '4.4.1'
        },
        namespaces={'xsi': 'http://www.w3.org/2001/XMLSchema-instance'})
    new_volume = etree.ElementTree(doi_batch)

    ## Assemble head
    head = make_simple_element('head', parent=new_volume.getroot())
    # Read the clock once so the batch id and the timestamp always agree.
    batch_time = str(int(time.time()))
    make_simple_element('doi_batch_id', text=batch_time, parent=head)
    make_simple_element('timestamp', text=batch_time, parent=head)
    depositor = make_simple_element('depositor', parent=head)
    make_simple_element('depositor_name', text=DEPOSITOR_NAME, parent=depositor)
    make_simple_element('email_address', text=EMAIL_ADDRESS, parent=depositor)
    make_simple_element('registrant', text=REGISTRANT, parent=head)

    ## Assemble body
    body = make_simple_element('body', parent=new_volume.getroot())

    # NOTE(review): year and months are initialised once, so a volume whose
    # meta block lacks them reuses the values of the previous volume. Kept
    # as-is; confirm whether this carry-over is intended.
    year = ""
    start_month = ""
    end_month = ""

    for full_volume_id in sorted(volumes):
        collection_id, volume_id, _ = deconstruct_anthology_id(full_volume_id)

        collection_file = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'xml',
                                       f'{collection_id}.xml')
        tree = etree.parse(collection_file)

        v = tree.getroot().find(f"./volume[@id='{volume_id}']")
        if v is None:
            print(f"* Can't find volume {full_volume_id}", file=sys.stderr)
            continue

        # Reset per-volume metadata so a missing tag can neither raise
        # UnboundLocalError nor silently reuse the previous volume's value
        # (a stale url would produce duplicate DOIs).
        url = booktitle = address = publisher = None

        ## Assemble frontmatter
        c = make_simple_element('conference', parent=body)
        contribs = make_simple_element('contributors', parent=c)
        editor_index = 0

        meta = v.find('./meta')
        # A volume without <meta> yields no tags; the url check below then
        # reports it.
        for tag in meta if meta is not None else ():
            if tag.tag == 'year':
                year = tag.text
            elif tag.tag == 'month':
                month = tag.text
                try:
                    parts = re.split('[-–]', month)
                    start_month = MONTH_HASH[parts[0]]
                    # A single month serves as both start and end.
                    end_month = MONTH_HASH[parts[1] if len(parts) > 1 else parts[0]]
                except (KeyError, TypeError):
                    # KeyError: unknown month name; TypeError: empty <month/>.
                    # Exit with a readable message instead of a traceback.
                    print(f"FATAL: can't parse month {month} in {full_volume_id}",
                          file=sys.stderr)
                    sys.exit(1)
            elif tag.tag == 'url':
                url = tag.text
            elif tag.tag == 'booktitle':
                # Flatten nested markup; tag.text alone stops at the first
                # child element and would truncate such titles.
                booktitle = formatter.as_text(tag)
            elif tag.tag == 'address':
                address = tag.text
            elif tag.tag == 'publisher':
                publisher = tag.text
            elif tag.tag == 'editor':
                add_person(contribs, tag, 'chair', editor_index)
                editor_index += 1

        if not url:
            # Every DOI below is derived from the volume url.
            print(f"FATAL: no url found in the meta block of {full_volume_id}",
                  file=sys.stderr)
            sys.exit(1)

        # Assemble Event Metadata
        em = make_simple_element('event_metadata', parent=c)
        make_simple_element('conference_name', parent=em, text=booktitle)
        make_simple_element('conference_location', parent=em, text=address)
        make_simple_element('conference_date',
                            parent=em,
                            attrib={
                                'start_year': year,
                                'end_year': year,
                                'start_month': start_month,
                                'end_month': end_month
                            })

        # Assemble Proceedings Metadata
        pm = make_simple_element('proceedings_metadata', parent=c, attrib={'language': 'en'})
        make_simple_element('proceedings_title', parent=pm, text=booktitle)
        p = make_simple_element('publisher', parent=pm)
        make_simple_element('publisher_name', parent=p, text=publisher)
        make_simple_element('publisher_place', parent=p, text=PUBLISHER_PLACE)
        pd = make_simple_element('publication_date', parent=pm)
        make_simple_element('year', parent=pd, text=year)
        make_simple_element('noisbn', parent=pm, attrib={'reason': 'simple_series'})

        # DOI assignation data
        dd = make_simple_element('doi_data', parent=pm)
        make_simple_element('doi', parent=dd, text=DOI_PREFIX + url)
        make_simple_element('resource', parent=dd, text=ANTHOLOGY_URL.format(url))

        for paper in v.findall('./paper'):
            ## Individual Paper Data

            # TODO: this is not future-proof, should use anthology.util library functions
            # NOTE(review): for any other url length aa_id stays empty, so
            # the paper DOI equals the volume DOI.
            aa_id = ""
            if len(url) == 6:
                aa_id = '{:02d}'.format(int(paper.attrib['id']))
            elif len(url) == 5:
                aa_id = '{:03d}'.format(int(paper.attrib['id']))

            cp = make_simple_element('conference_paper', parent=c)

            # contributors
            contribs = make_simple_element('contributors', parent=cp)
            for author_index, author in enumerate(paper.findall('./author')):
                add_person(contribs, author, 'author', author_index)

            for title in paper.iter(tag='title'):
                o_titles = make_simple_element('titles', parent=cp)
                make_simple_element('title', parent=o_titles, text=formatter.as_text(title))

            pd = make_simple_element('publication_date', parent=cp)
            o_year = make_simple_element('year', parent=pd)
            o_year.text = year

            for pages in paper.iter(tag='pages'):
                if pages.text is None:
                    # An empty <pages/> carries no range; skip it instead of
                    # crashing in re.split.
                    continue
                o_pages = make_simple_element('pages', parent=cp)
                fp = make_simple_element('first_page', parent=o_pages)
                lp = make_simple_element('last_page', parent=o_pages)
                page_parts = re.split('[-–]', pages.text)
                if len(page_parts) > 1:
                    fp.text = page_parts[0]
                    lp.text = page_parts[1]
                else:
                    # only one page
                    fp.text = pages.text
                    lp.text = pages.text

            # DOI assignation data
            dd = make_simple_element('doi_data', parent=cp)
            make_simple_element('doi', parent=dd, text=DOI_PREFIX + url + aa_id)
            make_simple_element('resource', parent=dd, text=ANTHOLOGY_URL.format(url + aa_id))

    print(
        etree.tostring(new_volume,
                       pretty_print=True,
                       encoding='UTF-8',
                       xml_declaration=True,
                       with_tail=True).decode('utf-8'))