"--ingest-date", "-d", type=str, default=today, help="Ingestion date as YYYY-MM-DD. Default: %(default)s.", ) parser.add_argument( "--append", "-a", action="store_true", help="Append to existing volume instead of quitting.", ) args = parser.parse_args() people = AnthologyIndex(None, srcdir=os.path.join(os.path.dirname(sys.argv[0]), "..", "data")) pdf_directory = os.path.dirname(args.infile) tree_being_added = etree.parse(args.infile) # Ensure nested format root_being_added = make_nested(tree_being_added.getroot(), pdf_path=os.path.dirname(args.infile)) collection_id = root_being_added.attrib["id"] # Ensure names are properly identified ambiguous = {} for paper in root_being_added.findall(".//paper"): anth_id = build_anthology_id(collection_id, paper.getparent().attrib["id"],
def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
    venue_keys = [
        venue["slug"].lower()
        for _, venue in VenueIndex(srcdir=anthology_datadir).items()
    ]

    # Build list of volumes, confirm uniqueness
    unseen_venues = []
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))

        venue_name = meta["abbrev"].lower()
        if venue_name not in venue_keys:
            unseen_venues.append(meta["abbrev"])

        meta["path"] = proceedings
        meta["collection_id"] = collection_id = meta["year"] + "." + meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print(f"Error: duplicate volume ID {volume_full_id}")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

    # Make sure all venues exist
    if len(unseen_venues) > 0:
        print("FATAL: The following venue(s) don't exist in venues.yaml:")
        for venue in unseen_venues:
            print(f"- {venue}")
        print("Please create entries for them and re-ingest.")
        sys.exit(1)

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        year = meta["year"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        # copy the book
        book_src_filename = meta["abbrev"] + "-" + year
        book_src_path = os.path.join(root_path, book_src_filename) + ".pdf"
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf"
            )
            if not args.dry_run and not os.path.exists(book_dest_path):
                log(f"Copying {book_src_path} -> {book_dest_path}", args.dry_run)
                shutil.copyfile(book_src_path, book_dest_path)

        # copy the paper PDFs (file names end in .{number}.pdf)
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            match = re.match(rf".*\.(\d+)\.pdf", pdf_file)
            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf"
                )
                if not args.dry_run and not os.path.exists(pdf_dest_path):
                    log(f"Copying {pdf_src_path} -> {pdf_dest_path}", args.dry_run)
                    shutil.copyfile(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir, venue_name)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(os.path.join(root_path, "additional")):
                attachment_file_path = os.path.join(
                    root_path, "additional", attachment_file
                )

                match = re.match(
                    rf"{year}\.{venue_name}-\w+\.(\d+)_?(\w+)\.(\w+)$", attachment_file
                )
                if match is None:
                    print(
                        f"* Warning: no attachment match for {attachment_file}",
                        file=sys.stderr,
                    )
                    sys.exit(2)

                paper_num, type_, ext = match.groups()
                paper_num = int(paper_num)

                file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                dest_path = os.path.join(attachments_dest_dir, file_name)
                if not args.dry_run and not os.path.exists(dest_path):
                    log(f"Copying {attachment_file} -> {dest_path}", args.dry_run)
                    shutil.copyfile(attachment_file_path, dest_path)

                collections[collection_id][volume_name][paper_num]["attachments"].append(
                    (dest_path, type_)
                )

    people = AnthologyIndex(None, srcdir=anthology_datadir)

    def disambiguate_name(node, anth_id):
        name = PersonName.from_element(node)
        ids = people.get_ids(name)
        if len(ids) > 1:
            choice = -1
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                choice = int(input("--> "))
            node.attrib["id"] = ids[choice]

    for collection_id, collection in collections.items():
        # Newly added volumes, so we can normalize and name-disambig later
        newly_added_volumes = []

        collection_file = os.path.join(
            args.anthology_dir, "data", "xml", f"{collection_id}.xml"
        )
        if os.path.exists(collection_file):
            root_node = etree.parse(collection_file).getroot()
        else:
            root_node = make_simple_element("collection", attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume",
                attrib={"id": volume_id, "ingest-date": args.ingest_date},
            )

            # Replace the existing one if present
            existing_volume_node = root_node.find(f"./volume[@id='{volume_id}']")
            for i, child in enumerate(root_node):
                if child.attrib["id"] == volume_id:
                    root_node[i] = volume_node
                    break
            else:
                root_node.append(volume_node)

            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)
                # print(etree.tostring(paper_node, pretty_print=True))

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    for author_or_editor in chain(
                        paper_node.findall("./author"), paper_node.findall("./editor")
                    ):
                        meta_node.append(author_or_editor)
                        author_or_editor.tag = "editor"
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            attrib={"hash": compute_hash_from_file(book_dest_path)},
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields (iterate over a copy, since we mutate)
                    for child in list(paper_node):
                        if child.tag in [
                            "editor",
                            "address",
                            "booktitle",
                            "publisher",
                            "year",
                            "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                            "hash": compute_hash_from_file(path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

                # Normalize
                for oldnode in paper_node:
                    normalize(oldnode, informat="latex")

                for name_node in chain(
                    paper_node.findall("./author"), paper_node.findall("./editor")
                ):
                    disambiguate_name(name_node, paper_id_full)

            # Other data from the meta file
            if "isbn" in meta:
                make_simple_element("isbn", meta["isbn"], parent=meta_node)

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(
            collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True
        )
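# `log` is defined elsewhere in this script. A minimal sketch of the behavior
# the calls above assume: print the message, marking it when --dry-run is set.
# Hypothetical reconstruction, not necessarily the actual definition.
def log_sketch(text, fake=False):
    message = "[DRY RUN] " if fake else ""
    print(f"{message}{text}", file=sys.stderr)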
def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    # Build list of volumes, confirm uniqueness
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))

        meta["path"] = proceedings
        meta["collection_id"] = collection_id = meta["year"] + "." + meta["abbrev"].lower()
        volume_name = meta["volume_name"]
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print(f"Error: duplicate volume ID {volume_full_id}")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume_name"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        print(f"VOLUME: {volume}")

        # copy the book
        book_src_filename = meta["abbrev"] + "-" + meta["year"]
        book_src_path = os.path.join(root_path, book_src_filename) + ".pdf"
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf"
            )
            log(f"Copying {book_src_path} -> {book_dest_path}", args.dry_run)
            if not args.dry_run:
                shutil.copyfile(book_src_path, book_dest_path)

        # copy the paper PDFs (names are {abbrev}{number}.pdf)
        abbrev = meta["abbrev"]
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            match = re.match(rf"{abbrev}(\d+)\.pdf", pdf_file)
            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf"
                )
                log(
                    f"Copying [{paper_id_full}] {pdf_src_path} -> {pdf_dest_path}",
                    args.dry_run,
                )
                if not args.dry_run:
                    shutil.copyfile(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir, collection_id)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(os.path.join(root_path, "additional")):
                match = re.match(rf"{abbrev}(\d+)_(\w+)\.(\w+)", attachment_file)
                if match is not None:
                    paper_num, type_, ext = match.groups()
                    paper_num = int(paper_num)

                    file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                    dest_path = os.path.join(attachments_dest_dir, file_name)
                    attachment_file_path = os.path.join(
                        root_path, "additional", attachment_file
                    )
                    log(f"Copying {attachment_file} -> {dest_path}", args.dry_run)
                    if not args.dry_run:
                        shutil.copyfile(attachment_file_path, dest_path)

                    collections[collection_id][volume_name][paper_num][
                        "attachments"
                    ].append((dest_path, type_))

    people = AnthologyIndex(
        None, srcdir=os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
    )

    for collection_id, collection in collections.items():
        collection_file = os.path.join(
            args.anthology_dir, "data", "xml", f"{collection_id}.xml"
        )
        root_node = make_simple_element("collection", attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume", attrib={"id": volume_id}, parent=root_node
            )
            meta = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)
                # print(etree.tostring(paper_node, pretty_print=True))

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta.append(title_node)
                    for editor in paper_node.findall("editor"):
                        meta.append(editor)
                    meta.append(paper_node.find("publisher"))
                    meta.append(paper_node.find("address"))
                    meta.append(paper_node.find("month"))
                    meta.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url", text=f"{collection_id}-{volume_name}", parent=meta
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields (iterate over a copy, since we mutate)
                    for child in list(paper_node):
                        if child.tag in [
                            "editor",
                            "address",
                            "booktitle",
                            "publisher",
                            "year",
                            "month",
                        ]:
                            paper_node.remove(child)

                # attachments were stored as (path, type) tuples above
                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=path,
                        attrib={"type": type_},
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

        # Normalize
        for paper in root_node.findall(".//paper"):
            for oldnode in paper:
                normalize(oldnode, informat="latex")

        # Ensure names are properly identified
        ambiguous = {}
        for paper in root_node.findall(".//paper"):
            anth_id = build_anthology_id(
                collection_id, paper.getparent().attrib["id"], paper.attrib["id"]
            )

            for node in chain(paper.findall("author"), paper.findall("editor")):
                name = PersonName.from_element(node)
                ids = people.get_ids(name)
                if len(ids) > 1:
                    print(
                        f"WARNING ({anth_id}): ambiguous author {name}, defaulting to first of {ids}"
                    )
                    ambiguous[anth_id] = (name, ids)
                    node.attrib["id"] = ids[0]

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(
            collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True
        )
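# `read_meta` parses a proceedings' "meta" file into a dict with keys such as
# "abbrev", "year", "volume", and "title" (as used above). A minimal sketch
# assuming one "key value" pair per line; the real file format may differ.
def read_meta_sketch(path):
    meta = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            key, _, value = line.partition(" ")
            meta[key] = value.strip()
    return meta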
def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")

    venue_index = VenueIndex(srcdir=anthology_datadir)
    venue_keys = [venue["slug"].lower() for _, venue in venue_index.items()]

    sig_index = SIGIndex(srcdir=anthology_datadir)

    # Build list of volumes, confirm uniqueness
    unseen_venues = []
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))

        venue_abbrev = meta["abbrev"]
        venue_slug = venue_index.get_slug(venue_abbrev)

        if str(datetime.now().year) in venue_abbrev:
            print(f"Fatal: Venue assembler put year in acronym: '{venue_abbrev}'")
            sys.exit(1)

        if venue_slug not in venue_keys:
            unseen_venues.append((venue_slug, venue_abbrev, meta["title"]))

        meta["path"] = proceedings
        meta["collection_id"] = collection_id = meta["year"] + "." + venue_slug
        volume_name = meta["volume"].lower()
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print(f"Error: duplicate volume ID {volume_full_id}")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

        if "sig" in meta:
            print(
                f"Add this line to {anthology_datadir}/sigs/{meta['sig'].lower()}.yaml:"
            )
            print(f"  - {meta['year']}:")
            print(f"    - {volume_full_id} # {meta['booktitle']}")

    # Make sure all venues exist
    if len(unseen_venues) > 0:
        for venue in unseen_venues:
            slug, abbrev, title = venue
            print(f"Creating venue '{abbrev}' ({title})")
            venue_index.add_venue(abbrev, title)
        venue_index.dump(directory=anthology_datadir)

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        year = meta["year"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        # copy the book
        book_src_filename = f'{year}.{meta["abbrev"]}-{volume_name}.pdf'
        book_src_path = os.path.join(root_path, book_src_filename)
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf"
            )
            if not args.dry_run:
                maybe_copy(book_src_path, book_dest_path)

        # copy the paper PDFs (file names end in .{number}.pdf)
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # Skip . (hidden) files
            if os.path.basename(pdf_file).startswith("."):
                continue

            match = re.match(rf".*\.(\d+)\.pdf", pdf_file)
            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf"
                )
                if not args.dry_run:
                    maybe_copy(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir, venue_name)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(os.path.join(root_path, "additional")):
                if os.path.basename(attachment_file).startswith("."):
                    continue

                attachment_file_path = os.path.join(
                    root_path, "additional", attachment_file
                )

                match = re.match(
                    rf"{year}\.{venue_name}-\w+\.(\d+)_?(\w+)\.(\w+)$", attachment_file
                )
                if match is None:
                    print(
                        f"* Warning: no attachment match for {attachment_file}",
                        file=sys.stderr,
                    )
                    sys.exit(2)

                paper_num, type_, ext = match.groups()
                paper_num = int(paper_num)

                file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                dest_path = os.path.join(attachments_dest_dir, file_name)
                if not args.dry_run and not os.path.exists(dest_path):
                    log(f"Copying {attachment_file} -> {dest_path}", args.dry_run)
                    shutil.copyfile(attachment_file_path, dest_path)

                collections[collection_id][volume_name][paper_num]["attachments"].append(
                    (dest_path, type_)
                )

    people = AnthologyIndex(None, srcdir=anthology_datadir)

    def correct_caps(person, name_node, anth_id):
        """
        Many people submit their names in "ALL CAPS" or "all lowercase".
        Correct this with heuristics.
        """
        name = name_node.text
        if name.islower() or name.isupper():
            # capitalize all parts
            corrected = " ".join(part.capitalize() for part in name.split())
            print(
                f"-> Correcting capitalization of '{name}' to '{corrected}'",
                file=sys.stderr,
            )
            name_node.text = corrected

    def disambiguate_name(node, anth_id):
        name = PersonName.from_element(node)
        ids = people.get_ids(name)
        if len(ids) > 1:
            choice = -1
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                choice = int(input("--> "))
            node.attrib["id"] = ids[choice]

    for collection_id, collection in collections.items():
        # Newly added volumes, so we can normalize and name-disambig later
        newly_added_volumes = []

        collection_file = os.path.join(
            args.anthology_dir, "data", "xml", f"{collection_id}.xml"
        )
        if os.path.exists(collection_file):
            root_node = etree.parse(collection_file).getroot()
        else:
            root_node = make_simple_element("collection", attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume",
                attrib={"id": volume_id, "ingest-date": args.ingest_date},
            )

            # Replace the existing one if present
            existing_volume_node = root_node.find(f"./volume[@id='{volume_id}']")
            for i, child in enumerate(root_node):
                if child.attrib["id"] == volume_id:
                    root_node[i] = volume_node
                    break
            else:
                root_node.append(volume_node)

            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    for author_or_editor in chain(
                        paper_node.findall("./author"), paper_node.findall("./editor")
                    ):
                        meta_node.append(author_or_editor)
                        author_or_editor.tag = "editor"
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            attrib={"hash": compute_hash_from_file(book_dest_path)},
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields (iterate over a copy, since we mutate)
                    for child in list(paper_node):
                        if child.tag in [
                            "editor",
                            "address",
                            "booktitle",
                            "publisher",
                            "year",
                            "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                            "hash": compute_hash_from_file(path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

                # Normalize
                for oldnode in paper_node:
                    normalize(oldnode, informat="latex")

                # Adjust the language tag
                language_node = paper_node.find("./language")
                if language_node is not None:
                    try:
                        lang = iso639.languages.get(name=language_node.text)
                    except KeyError:
                        raise Exception(f"Can't find language '{language_node.text}'")
                    language_node.text = lang.part3
                    print(language_node.text)

                # Fix author names
                for name_node in chain(
                    paper_node.findall("./author"), paper_node.findall("./editor")
                ):
                    disambiguate_name(name_node, paper_id_full)
                    person = PersonName.from_element(name_node)
                    for name_part in name_node:
                        correct_caps(person, name_part, paper_id_full)

            # Other data from the meta file
            if "isbn" in meta:
                make_simple_element("isbn", meta["isbn"], parent=meta_node)

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(
            collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True
        )
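# `maybe_copy`, used in the copy loops above, is assumed to behave like the
# older log-then-copy pattern: copy only when the destination is not already
# in place, and say so. Illustrative sketch under that assumption, not the
# actual helper from the Anthology's utilities.
def maybe_copy_sketch(src, dest):
    if not os.path.exists(dest):
        print(f"Copying {src} -> {dest}", file=sys.stderr)
        shutil.copyfile(src, dest)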
parser = argparse.ArgumentParser()
parser.add_argument('infile')
parser.add_argument(
    '--ingest-date',
    '-d',
    type=str,
    default=today,
    help='Ingestion date as YYYY-MM-DD. Default: %(default)s.')
parser.add_argument('--append',
                    '-a',
                    action='store_true',
                    help='Append to existing volume instead of quitting.')
args = parser.parse_args()

people = AnthologyIndex(
    None, srcdir=os.path.join(os.path.dirname(sys.argv[0]), '..', 'data'))

tree_being_added = etree.parse(args.infile)

# Ensure nested format
root_being_added = make_nested(tree_being_added.getroot())
collection_id = root_being_added.attrib['id']

# Ensure names are properly identified
ambiguous = {}
for paper in root_being_added.findall('.//paper'):
    anth_id = build_anthology_id(collection_id,
                                 paper.getparent().attrib['id'],
                                 paper.attrib['id'])