def disambiguate_name(node):
    # Assumes the module-level people index (AnthologyIndex) and collection_id.
    name = PersonName.from_element(node)
    ids = people.get_ids(name)

    if node.tag == "editor":  # meta block, so the grandparent is the volume
        anth_id = build_anthology_id(collection_id,
                                     node.getparent().getparent().attrib["id"])
    elif node.tag == "author":  # paper, so build the full paper ID
        anth_id = build_anthology_id(
            collection_id,
            node.getparent().getparent().attrib["id"],
            node.getparent().attrib["id"],
        )

    if len(ids) > 1:
        choice = -1
        while choice < 0 or choice >= len(ids):
            print(f"({anth_id}): ambiguous author {name}; Please choose from the following:")
            for i, id_ in enumerate(ids):
                print(f"[{i}] {id_} ({people.get_comment(id_)})")
            choice = int(input("--> "))
        node.attrib["id"] = ids[choice]
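A minimal usage sketch for this function. The variable root is an assumption (a parsed collection tree), as is the idea that the caller walks every author and editor node; the module-level people and collection_id from the surrounding script are assumed to be set:

from itertools import chain

# Hypothetical driver (assumption): interactively resolve every ambiguous
# name in a parsed <collection> tree held in `root`.
for paper in root.findall(".//paper"):
    for node in chain(paper.findall("author"), paper.findall("editor")):
        disambiguate_name(node)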
def add_doi(xml_node, collection_id, volume_id, force=False):
    if 'id' in xml_node.attrib:  # normal paper
        paper_id = int(xml_node.attrib['id'])
    else:  # frontmatter
        paper_id = 0
    anth_id = build_anthology_id(collection_id, volume_id, paper_id)
    new_doi_text = f'{data.DOI_PREFIX}{anth_id}'

    doi = xml_node.find('doi')
    if doi is not None:
        if not force:
            print(f'-> [{anth_id}] Cowardly refusing to overwrite existing DOI {doi.text} (use --force)',
                  file=sys.stderr)
            return False
        xml_node.remove(doi)  # --force given: drop the old DOI so we don't append a duplicate

    doi_url = f'{data.DOI_URL_PREFIX}{data.DOI_PREFIX}{anth_id}'
    for tries in [1, 2, 3]:  # lots of random failures, so retry a few times
        result = test_url_code(doi_url)
        if result.status_code == 200:
            doi = make_simple_element('doi', text=new_doi_text)
            print(f'-> Adding DOI {new_doi_text}', file=sys.stderr)
            xml_node.append(doi)
            return True
        elif result.status_code == 429:  # too many requests
            pause_for = int(result.headers['Retry-After'])
            print(f'--> Got 429, pausing for {pause_for} seconds', file=sys.stderr)
            sleep(pause_for + 1)
        elif result.status_code == 404:  # not found
            break

    print(f"-> Couldn't add DOI {doi_url}", file=sys.stderr)
    return False
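The helper test_url_code is not shown in this excerpt. Given that the caller inspects status_code and the Retry-After header, a plausible reading is a thin wrapper around requests; this sketch is an assumption, not the actual helper:

import requests

def test_url_code(url):
    # Assumed implementation: HEAD the URL and return the full response,
    # so callers can check status_code and headers (e.g., Retry-After).
    return requests.head(url, allow_redirects=True)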
def add_doi(xml_node, collection_id, volume_id, force=False):
    if 'id' in xml_node.attrib:  # normal paper
        paper_id = int(xml_node.attrib['id'])
    else:  # frontmatter
        paper_id = 0
    anth_id = build_anthology_id(collection_id, volume_id, paper_id)
    new_doi_text = f'{data.DOI_PREFIX}{anth_id}'

    doi_url = f'{data.DOI_URL_PREFIX}{data.DOI_PREFIX}{anth_id}'
    if not test_url(doi_url):
        print(f"-> [{anth_id}] Skipping since DOI {doi_url} doesn't exist")
        return False

    doi = xml_node.find('doi')
    if doi is not None:
        if not force:
            print(f'-> [{anth_id}] Cowardly refusing to overwrite existing DOI {doi.text} (use --force)',
                  file=sys.stderr)
            return False
        xml_node.remove(doi)  # --force given: drop the old DOI so we don't append a duplicate

    doi = make_simple_element('doi', text=new_doi_text)
    print(f'Adding DOI {new_doi_text}', file=sys.stderr)
    xml_node.append(doi)
    return True
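A driver sketch that works with either variant of add_doi above. The file name and the iteration over frontmatter plus paper nodes are illustrative assumptions, not code from the original scripts:

# Hypothetical invocation over one collection file (path and IDs invented):
tree = etree.parse("data/xml/2020.wmt.xml")
collection_id = tree.getroot().attrib["id"]
for volume in tree.getroot().findall("volume"):
    for node in volume.findall("frontmatter") + volume.findall("paper"):
        add_doi(node, collection_id, volume.attrib["id"], force=False)
tree.write("data/xml/2020.wmt.xml", encoding="UTF-8", xml_declaration=True)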
people = AnthologyIndex(None,
                        srcdir=os.path.join(os.path.dirname(sys.argv[0]), "..", "data"))

pdf_directory = os.path.dirname(args.infile)

tree_being_added = etree.parse(args.infile)

# Ensure nested format
root_being_added = make_nested(tree_being_added.getroot(),
                               pdf_path=os.path.dirname(args.infile))
collection_id = root_being_added.attrib["id"]

# Ensure names are properly identified
ambiguous = {}
for paper in root_being_added.findall(".//paper"):
    anth_id = build_anthology_id(collection_id,
                                 paper.getparent().attrib["id"],
                                 paper.attrib["id"])
    for node in chain(paper.findall("author"), paper.findall("editor")):
        name = PersonName.from_element(node)
        ids = people.get_ids(name)
        if len(ids) > 1:
            print(f"WARNING ({anth_id}): ambiguous author {name}, defaulting to first of {ids}")
            ambiguous[anth_id] = (name, ids)
            node.attrib["id"] = ids[0]

    # Ensure the PDF exists. It should sit in the same directory as the XML file being ingested.
    pdf_path = os.path.join(pdf_directory, f"{anth_id}.pdf")
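    # The excerpt ends at the pdf_path computation; presumably the script next
    # verifies that the PDF actually exists. A sketch of that presumed check
    # (an assumption, not the original code):
    if not os.path.exists(pdf_path):
        print(f"Fatal: no PDF found at {pdf_path}", file=sys.stderr)
        sys.exit(1)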
def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    # Build list of volumes, confirm uniqueness
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))

        meta["path"] = proceedings
        meta["collection_id"] = collection_id = (meta["year"] + "." + meta["abbrev"].lower())
        volume_name = meta["volume_name"]
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print(f"Error: volume ID {volume_full_id} is defined by more than one proceedings")
            sys.exit(1)  # volumes must be unique, so bail out

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume_name"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        print(f"VOLUME: {volume}")

        # copy the book
        book_src_filename = meta["abbrev"] + "-" + meta["year"]
        book_src_path = os.path.join(root_path, book_src_filename) + ".pdf"
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf")
            log(f"Copying {book_src_path} -> {book_dest_path}", args.dry_run)
            if not args.dry_run:
                shutil.copyfile(book_src_path, book_dest_path)

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        abbrev = meta["abbrev"]
        for pdf_file in os.listdir(pdf_src_dir):
            # names are {abbrev}{number}.pdf
            match = re.match(rf"{abbrev}(\d+)\.pdf", pdf_file)
            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                # the .bib file lives under bib/ with the same base name
                bib_path = os.path.join(root_path, "bib",
                                        pdf_file.replace(".pdf", ".bib"))

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(pdfs_dest_dir,
                                             f"{collection_id}-{volume_name}.{paper_num}.pdf")
                log(f"Copying [{paper_id_full}] {pdf_src_path} -> {pdf_dest_path}",
                    args.dry_run)
                if not args.dry_run:
                    shutil.copyfile(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir, collection_id)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(os.path.join(root_path, "additional")):
                # names are {abbrev}{number}_{type}.{ext}
                match = re.match(rf"{abbrev}(\d+)_(\w+)\.(\w+)", attachment_file)
                if match is not None:
                    paper_num, type_, ext = match.groups()
                    paper_num = int(paper_num)

                    file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                    dest_path = os.path.join(attachments_dest_dir, file_name)
                    attachment_src_path = os.path.join(root_path, "additional", attachment_file)
                    log(f"Copying {attachment_src_path} -> {dest_path}", args.dry_run)
                    if not args.dry_run:
                        shutil.copyfile(attachment_src_path, dest_path)

                    # store (path, type) pairs so the XML step below can emit
                    # <attachment type="..."> elements
                    collections[collection_id][volume_name][paper_num]["attachments"].append(
                        (dest_path, type_))

    people = AnthologyIndex(None,
                            srcdir=os.path.join(os.path.dirname(sys.argv[0]), "..", "data"))

    for collection_id, collection in collections.items():
        collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                       f"{collection_id}.xml")
        root_node = make_simple_element("collection", attrib={"id": collection_id})
        for volume_id, volume in collection.items():
            volume_node = make_simple_element("volume",
                                              attrib={"id": volume_id},
                                              parent=root_node)
            meta = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)
                # print(etree.tostring(paper_node, pretty_print=True))

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree from the frontmatter entry
                    meta = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta.append(title_node)
                    for editor in paper_node.findall("editor"):
                        meta.append(editor)
                    meta.append(paper_node.find("publisher"))
                    meta.append(paper_node.find("address"))
                    meta.append(paper_node.find("month"))
                    meta.append(paper_node.find("year"))
                    # book_dest_path and volume_name carry over from the copy
                    # loop above, so this relies on the book having just been copied
                    if book_dest_path is not None:
                        make_simple_element("url",
                                            text=f"{collection_id}-{volume_name}",
                                            parent=meta)

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields
                    for child in paper_node:
                        if child.tag in [
                                "editor",
                                "address",
                                "booktitle",
                                "publisher",
                                "year",
                                "month",
                        ]:
                            paper_node.remove(child)

                # attachments were stored as (path, type) pairs above
                for attach_path, attach_type in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=attach_path,
                        attrib={"type": attach_type},
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

        # Normalize
        for paper in root_node.findall(".//paper"):
            for oldnode in paper:
                normalize(oldnode, informat="latex")

        # Ensure names are properly identified
        ambiguous = {}
        for paper in root_node.findall(".//paper"):
            anth_id = build_anthology_id(collection_id,
                                         paper.getparent().attrib["id"],
                                         paper.attrib["id"])

            for node in chain(paper.findall("author"), paper.findall("editor")):
                name = PersonName.from_element(node)
                ids = people.get_ids(name)
                if len(ids) > 1:
                    print(f"WARNING ({anth_id}): ambiguous author {name}, defaulting to first of {ids}")
                    ambiguous[anth_id] = (name, ids)
                    node.attrib["id"] = ids[0]

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True)
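To make the renaming done by the copy loop in main concrete, here is a worked example of its file-name conventions. All values are invented for illustration:

import re

# Invented values matching the conventions in main():
abbrev, collection_id, volume_name = "wmt", "2020.wmt", "1"

m = re.match(rf"{abbrev}(\d+)\.pdf", "wmt17.pdf")
paper_num = int(m[1])
print(f"{collection_id}-{volume_name}.{paper_num}.pdf")  # 2020.wmt-1.17.pdf

m = re.match(rf"{abbrev}(\d+)_(\w+)\.(\w+)", "wmt17_Software.zip")
num, type_, ext = m.groups()
print(f"{collection_id}-{volume_name}.{int(num)}.{type_}.{ext}")  # 2020.wmt-1.17.Software.zip

The collection file written at the end should then have roughly this shape, abbreviated and carrying over the invented values above; this is a sketch of the output, not a verbatim Anthology file:

<?xml version='1.0' encoding='UTF-8'?>
<collection id="2020.wmt">
  <volume id="1">
    <meta>
      <booktitle>...</booktitle>
      <editor><first>...</first><last>...</last></editor>
      <publisher>...</publisher>
      <address>...</address>
      <month>...</month>
      <year>2020</year>
      <url>2020.wmt-1</url>
    </meta>
    <frontmatter>...</frontmatter>
    <paper id="17">
      <title>...</title>
      <author><first>...</first><last>...</last></author>
      <attachment type="Software">.../2020.wmt/2020.wmt-1.17.Software.zip</attachment>
    </paper>
  </volume>
</collection>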