def main(args): print(f"Adding {args.award} to {args.anthology_id}...") collection_id, volume_id, paper_id = deconstruct_anthology_id( args.anthology_id) # Update XML xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{collection_id}.xml") tree = ET.parse(xml_file) paper = tree.getroot().find( f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']") if paper is None: print(f"Error: Can't find paper {args.anthology_id}, quitting") existing_award = paper.find("./award") if existing_award is not None and award.text.lower() == args.award: print( f"Error: Award {args.award} already exists for {args.anthology_id}, quitting" ) make_simple_element("award", args.award, parent=paper) indent(tree.getroot()) tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
def main(args):
    for xml_file in args.xml_files:
        tree = ET.parse(xml_file)
        for paper in tree.getroot().findall(".//paper"):
            make_simple_element("language", "eng", parent=paper)
        indent(tree.getroot())
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
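# After this pass, every <paper> gains a trailing <language> child; e.g.
# (illustrative output, not taken from a real collection file):
#
#   <paper id="1">
#     ...
#     <language>eng</language>
#   </paper>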
def add_video_tag(anth_paper, xml_parse):
    coll_id, vol_id, paper_id = deconstruct_anthology_id(anth_paper.anthology_id)
    paper = xml_parse.find(f'./volume[@id="{vol_id}"]/paper[@id="{paper_id}"]')
    if anth_paper.presentation_id.startswith("http"):
        video_url = anth_paper.presentation_id
    else:
        video_url = "https://slideslive.com/{}".format(anth_paper.presentation_id)
    make_simple_element("video", attrib={"tag": "video", "href": video_url},
                        parent=paper)
def add_doi(xml_node, collection_id, volume_id, force=False):
    if 'id' in xml_node.attrib:  # normal paper
        paper_id = int(xml_node.attrib['id'])
    else:  # frontmatter
        paper_id = 0
    anth_id = build_anthology_id(collection_id, volume_id, paper_id)
    new_doi_text = f'{data.DOI_PREFIX}{anth_id}'
    doi_url = f'{data.DOI_URL_PREFIX}{data.DOI_PREFIX}{anth_id}'

    if not test_url(doi_url):
        print(f"-> [{anth_id}] Skipping since DOI {doi_url} doesn't exist")
        return False

    doi = xml_node.find('doi')
    if doi is not None and not force:  # honor the --force flag
        print(
            f'-> [{anth_id}] Cowardly refusing to overwrite existing DOI {doi.text} (use --force)',
            file=sys.stderr)
        return False
    else:
        doi = make_simple_element('doi', text=new_doi_text)
        print(f'Adding DOI {new_doi_text}', file=sys.stderr)
        xml_node.append(doi)

    return True
def add_doi(xml_node, collection_id, volume_id, force=False):
    if 'id' in xml_node.attrib:  # normal paper
        paper_id = int(xml_node.attrib['id'])
    else:  # frontmatter
        paper_id = 0
    anth_id = build_anthology_id(collection_id, volume_id, paper_id)
    new_doi_text = f'{data.DOI_PREFIX}{anth_id}'

    doi = xml_node.find('doi')
    if doi is not None and not force:  # honor the --force flag
        print(
            f'-> [{anth_id}] Cowardly refusing to overwrite existing DOI {doi.text} (use --force)',
            file=sys.stderr)
        return False

    doi_url = f'{data.DOI_URL_PREFIX}{data.DOI_PREFIX}{anth_id}'
    for tries in [1, 2, 3]:  # lots of random failures
        result = test_url_code(doi_url)
        if result.status_code == 200:
            doi = make_simple_element('doi', text=new_doi_text)
            print(f'-> Adding DOI {new_doi_text}', file=sys.stderr)
            xml_node.append(doi)
            return True
        elif result.status_code == 429:  # too many requests
            pause_for = int(result.headers['Retry-After'])
            print(f'--> Got 429, pausing for {pause_for} seconds', file=sys.stderr)
            sleep(pause_for + 1)
        elif result.status_code == 404:  # not found
            break

    print(f"-> Couldn't add DOI {doi_url}", file=sys.stderr)
    return False
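# A minimal sketch of what the test_url_code helper used above might look like,
# assuming it is a thin wrapper around requests; the real helper is defined
# elsewhere and may differ.
import requests

def test_url_code(url: str) -> requests.Response:
    # A HEAD request is enough to check existence, and it surfaces the
    # Retry-After header that the 429 branch above relies on.
    return requests.head(url, allow_redirects=True, timeout=30)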
def process_xml(xml: Path, is_tacl: bool) -> Tuple[etree.Element, Any, Any]:
    # Note: returns a (paper, info, issue) triple, not a single element.
    logging.info("Reading {}".format(xml))
    tree = etree.parse(str(xml))
    root = tree.getroot()
    front = root.find("front")
    info, issue = get_article_journal_info(front, is_tacl)

    paper = etree.Element("paper")

    title_text = get_title(front)
    title = etree.Element("title")
    title.text = title_text
    paper.append(title)

    authors = get_authors(front)
    for given_names, surname in authors:
        first = etree.Element("first")
        first.text = given_names
        last = etree.Element("last")
        last.text = surname
        author = etree.Element("author")
        author.append(first)
        author.append(last)
        paper.append(author)

    doi_text = get_doi(front)
    doi = etree.Element("doi")
    doi.text = doi_text
    paper.append(doi)

    abstract_text = get_abstract(front)
    if abstract_text:
        make_simple_element("abstract", abstract_text, parent=paper)

    pages_tuple = get_pages(front)
    pages = etree.Element("pages")
    pages.text = "–".join(pages_tuple)  # en-dash, not hyphen!
    paper.append(pages)

    return paper, info, issue
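# Illustrative only: one way the returned triple might be consumed when
# building a volume. `year_root` and `volume_node` are hypothetical names for
# this sketch, not identifiers from the script.
for xml_path in sorted(year_root.glob("*_a_*/*.xml")):
    paper, info, issue = process_xml(xml_path, is_tacl=True)
    volume_node.append(paper)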
def write_bibkeys(anthology, srcdir, commit=False):
    for volume_id, volume in anthology.volumes.items():
        papers_without_bibkey = []

        for paper in volume:
            bibkey = paper.bibkey
            if bibkey is None or bibkey == paper.full_id:
                papers_without_bibkey.append(paper)

        if papers_without_bibkey:
            log.info(
                f"Found {len(papers_without_bibkey):4d} papers without bibkeys in volume {volume_id}"
            )
            if not commit:
                continue
        else:
            continue

        # We got some new bibkeys and need to write them to the XML
        xml_file = os.path.join(srcdir, "xml", f"{volume.collection_id}.xml")
        tree = ET.parse(xml_file)
        root = tree.getroot()

        for paper in papers_without_bibkey:
            if paper.paper_id == "0":
                node = root.find(f"./volume[@id='{paper.volume_id}']/frontmatter")
                if node is None:  # dummy frontmatter
                    continue
            else:
                node = root.find(
                    f"./volume[@id='{paper.volume_id}']/paper[@id='{paper.paper_id}']"
                )
            if node is None:
                log.error(f"Paper {paper.full_id} not found in {xml_file}")
                continue

            # Generate unique bibkey
            bibkey = anthology.pindex.create_bibkey(paper, vidx=anthology.venues)
            make_simple_element("bibkey", bibkey, parent=node)

        indent(root)
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
def main(args):
    for line in args.isbn_file:
        venue, isbn = line.rstrip().split()
        xml_file = os.path.join(
            os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{venue}.xml"
        )
        if not os.path.exists(xml_file):
            print(f"Can't find {xml_file}")
            continue
        tree = ET.parse(xml_file)
        meta = tree.getroot().find(".//volume[@id='1']/meta")
        if meta is not None and meta.find("./isbn") is None:
            print(f"Adding {isbn} to {venue} meta block")
            make_simple_element("isbn", isbn, parent=meta)
        elif meta is not None and meta.find("./isbn") is not None:
            print(f"{venue} already done")
        indent(tree.getroot())
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
def main(args): """ Downloads an Anthology paper and adds a RETRACTED watermark, then updates the XML with an appropriate <revision> and <retracted> tag. """ with tempfile.TemporaryDirectory() as tempdir: new_pdf = add_watermark(args.anthology_id, workdir=tempdir) add_revision( args.anthology_id, new_pdf, explanation="Retracted.", change_type="revision", dry_run=False, ) xml_file = get_xml_file(args.anthology_id) collection_id, volume_id, paper_id = deconstruct_anthology_id( args.anthology_id) tree = ET.parse(xml_file) if paper_id == "0": paper = tree.getroot().find( f"./volume[@id='{volume_id}']/frontmatter") else: paper = tree.getroot().find( f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']") if paper is None: print(f"Couldn't find paper {args.anthology_id}!", file=sys.stderr) sys.exit(2) print("Modifying the XML", file=sys.stderr) now = datetime.now() date = f"{now.year}-{now.month:02d}-{now.day:02d}" retracted_node = make_simple_element("retracted", args.explanation, attrib={"date": date}, parent=paper) indent(tree.getroot()) tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
    # Normalize
    for paper in root_being_added.findall('.//paper'):
        for oldnode in paper:
            process(oldnode, informat='xml')

    # Ingest each volume.
    # First, find the XML file.
    collection_file = os.path.join(
        os.path.dirname(sys.argv[0]), '..', 'data', 'xml', f'{collection_id}.xml')
    if os.path.exists(collection_file):
        existing_tree = etree.parse(collection_file)
    else:
        existing_tree = etree.ElementTree(
            make_simple_element('collection', attrib={'id': collection_id}))

    # Insert each volume
    for i, new_volume in enumerate(root_being_added.findall('volume')):
        new_volume_id = int(new_volume.attrib['id'])
        existing_volume = existing_tree.getroot().find(
            f"./volume[@id='{new_volume_id}']")
        if existing_volume is None:
            new_volume.attrib['ingest-date'] = args.ingest_date

            # Find the insertion point among the other volumes
            insertion_point = 0
            for i, volume in enumerate(existing_tree.getroot()):
                if new_volume_id < int(volume.attrib['id']):
                    break
                insertion_point = i + 1
def main(args):
    code, year, _ = os.path.basename(args.tsv_file.name).split(".")
    collection_id = f"{year}.{code}"

    tree = etree.ElementTree(
        make_simple_element("collection", attrib={"id": collection_id}))

    volume_id = "1"
    volume = make_simple_element("volume", attrib={"id": volume_id})
    tree.getroot().insert(0, volume)

    # Create the metadata for the paper
    meta = None
    for row in csv.DictReader(args.meta_file, delimiter="\t"):
        if row["Conference code"] == collection_id:
            if row["Completed"] == "FALSE":
                print(
                    f"Warning: Conference {collection_id} is not marked as completed, can't ingest."
                )
                sys.exit(1)

            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle", row["Conference title"], parent=meta)
            make_simple_element("publisher", row["Publisher"], parent=meta)
            make_simple_element("address", row["Location"], parent=meta)
            make_simple_element("month", row["Dates held"], parent=meta)
            make_simple_element("year", row["Year"], parent=meta)

            if row["Editors"] != "" and "?" not in row["Editors"]:
                editors = row["Editors"].split(" and ")
                for editor_name in editors:
                    editor = make_simple_element("editor", parent=meta)
                    last, first = editor_name.split(", ")
                    make_simple_element("first", first, parent=editor)
                    make_simple_element("last", last, parent=editor)
            break
    else:
        print(
            f"Couldn't find conference code {collection_id} in 'Conference code' field of metadata file {args.meta_file.name}",
            file=sys.stderr)
        sys.exit(1)

    if not os.path.exists(collection_id):
        print(f"Creating {collection_id}", file=sys.stderr)
        os.makedirs(collection_id)

    # Create entries for all the papers
    for paperid, row in enumerate(csv.DictReader(args.tsv_file, delimiter='\t'), 1):
        title_text = row["Title"]
        author_list = row["Authors"].split(" and ")
        pdf = row["Pdf"]

        paper = make_simple_element("paper", attrib={"id": str(paperid)},
                                    parent=volume)
        make_simple_element("title", title_text, parent=paper)

        for author_name in author_list:
            if author_name == "":
                continue
            author = make_simple_element("author", parent=paper)
            print(author_name)
            last, first = author_name.split(", ")
            make_simple_element("first", first, parent=author)
            make_simple_element("last", last, parent=author)

        url = f"{collection_id}-{volume_id}.{paperid}"
        pdf_local_path = os.path.join(collection_id, f"{url}.pdf")
        make_simple_element("url", url, parent=paper)
        download(pdf, pdf_local_path)

        if "Abstract" in row:
            make_simple_element("abstract", row["Abstract"], parent=paper)

        if "Presentation" in row:
            extension = row["Presentation"].split(".")[-1]
            filename = f"{collection_id}-{volume_id}.{paperid}.Presentation.{extension}"
            make_simple_element("attachment", filename,
                                attrib={"type": "presentation"}, parent=paper)
            download(row["Presentation"], os.path.join(collection_id, filename))

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    collection_file = os.path.join(args.anthology, "data", "xml",
                                   f"{collection_id}.xml")
    tree.write(collection_file, encoding="UTF-8", xml_declaration=True,
               with_tail=True)
def main(args):
    def maybe_copy(file_from, file_to):
        if not args.dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    change_type = "erratum" if args.erratum else "revision"
    change_letter = "e" if args.erratum else "v"

    print(f"Processing {change_type} to {args.anthology_id}...")

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        try:
            print(f"-> Downloading file from {args.path}", file=sys.stderr)
            with urllib.request.urlopen(args.path) as url, open(
                    input_file_path, mode="wb") as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            print("An SSL error was encountered in downloading the files.",
                  file=sys.stderr)
            sys.exit(1)
    else:
        input_file_path = args.path

    detected = filetype.guess(input_file_path)
    if detected is None or not detected.mime.endswith(detected.extension):
        mime_type = 'UNKNOWN' if detected is None else detected.mime
        print(
            f"FATAL: {args.anthology_id} file {args.path} has MIME type {mime_type}",
            file=sys.stderr,
        )
        sys.exit(1)

    collection_id, volume_id, paper_id = deconstruct_anthology_id(args.anthology_id)
    paper_extension = args.path.split(".")[-1]

    # The new version
    revno = None

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Update XML
    xml_file = os.path.join(
        os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{collection_id}.xml")
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(
        f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if args.erratum else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not args.dry_run:
            if not args.erratum and revno == 2:
                # First revision requires making the original version explicit
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{args.anthology_id}{change_letter}1",
                        "hash": checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                args.explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{args.anthology_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": args.date,
                },
                parent=paper,
            )
            indent(tree.getroot())
            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML',
                  file=sys.stderr)
    else:
        print(
            f"-> FATAL: paper ID {args.anthology_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    output_dir = os.path.join(args.anthology_dir, "pdf", collection_id[0],
                              collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{args.anthology_id}.pdf")

    if not args.erratum and revno == 2:
        # There are no versioned files the first time around, so create the first one
        # (essentially backing up the original version)
        revised_file_v1_path = os.path.join(
            output_dir, f"{args.anthology_id}{change_letter}1.pdf")
        current_version = ANTHOLOGY_PDF.format(args.anthology_id)
        if not args.dry_run:
            try:
                print(
                    f"-> Downloading file from {args.path} to {revised_file_v1_path}",
                    file=sys.stderr,
                )
                with urllib.request.urlopen(current_version) as url, open(
                        revised_file_v1_path, mode="wb") as fh:
                    fh.write(url.read())
            except ssl.SSLError:
                print(
                    f"-> FATAL: An SSL error was encountered in downloading {args.path}.",
                    file=sys.stderr,
                )
                sys.exit(1)
        else:
            print(
                f"-> DRY RUN: Downloading file from {args.path} to {revised_file_v1_path}",
                file=sys.stderr,
            )

    revised_file_versioned_path = os.path.join(
        output_dir, f"{args.anthology_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(input_file_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if not args.erratum:
        maybe_copy(input_file_path, canonical_path)

    if args.path.startswith("http"):
        os.remove(input_file_path)
def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
    venue_index = VenueIndex(srcdir=anthology_datadir)
    venue_keys = [venue["slug"].lower() for _, venue in venue_index.items()]

    sig_index = SIGIndex(srcdir=anthology_datadir)

    # Build list of volumes, confirm uniqueness
    unseen_venues = []
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))
        venue_abbrev = meta["abbrev"]
        venue_slug = venue_index.get_slug(venue_abbrev)

        if str(datetime.now().year) in venue_abbrev:
            print(f"Fatal: Venue assembler put year in acronym: '{venue_abbrev}'")
            sys.exit(1)

        if venue_slug not in venue_keys:
            unseen_venues.append((venue_slug, venue_abbrev, meta["title"]))

        meta["path"] = proceedings
        meta["collection_id"] = collection_id = meta["year"] + "." + venue_slug
        volume_name = meta["volume"].lower()
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print(f"Error: duplicate volume {volume_full_id}")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

        if "sig" in meta:
            print(
                f"Add this line to {anthology_datadir}/sigs/{meta['sig'].lower()}.yaml:"
            )
            print(f" - {meta['year']}:")
            print(f" - {volume_full_id} # {meta['booktitle']}")

    # Make sure all venues exist
    if len(unseen_venues) > 0:
        for venue in unseen_venues:
            slug, abbrev, title = venue
            print(f"Creating venue '{abbrev}' ({title})")
            venue_index.add_venue(abbrev, title)
        venue_index.dump(directory=anthology_datadir)

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        year = meta["year"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        # copy the book
        book_src_filename = f'{year}.{meta["abbrev"]}-{volume_name}.pdf'
        book_src_path = os.path.join(root_path, book_src_filename)
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf")
            if not args.dry_run:
                maybe_copy(book_src_path, book_dest_path)

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # Skip . files
            if os.path.basename(pdf_file).startswith("."):
                continue

            # names are {abbrev}{number}.pdf
            match = re.match(rf".*\.(\d+)\.pdf", pdf_file)
            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf")
                if not args.dry_run:
                    maybe_copy(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir, venue_name)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(os.path.join(root_path, "additional")):
                if os.path.basename(attachment_file).startswith("."):
                    continue
                attachment_file_path = os.path.join(root_path, "additional",
                                                    attachment_file)

                match = re.match(
                    rf"{year}\.{venue_name}-\w+\.(\d+)_?(\w+)\.(\w+)$",
                    attachment_file)
                if match is None:
                    print(
                        f"* Warning: no attachment match for {attachment_file}",
                        file=sys.stderr,
                    )
                    sys.exit(2)

                paper_num, type_, ext = match.groups()
                paper_num = int(paper_num)

                file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                dest_path = os.path.join(attachments_dest_dir, file_name)
                if not args.dry_run and not os.path.exists(dest_path):
                    log(f"Copying {attachment_file} -> {dest_path}", args.dry_run)
                    shutil.copyfile(attachment_file_path, dest_path)

                collections[collection_id][volume_name][paper_num][
                    "attachments"].append((dest_path, type_))

    people = AnthologyIndex(None, srcdir=anthology_datadir)

    def correct_caps(person, name_node, anth_id):
        """
        Many people submit their names in "ALL CAPS" or "all lowercase".
        Correct this with heuristics.
        """
        name = name_node.text
        if name.islower() or name.isupper():
            # capitalize all parts
            corrected = " ".join(list(map(lambda x: x.capitalize(), name.split())))
            print(
                f"-> Correcting capitalization of '{name}' to '{corrected}'",
                file=sys.stderr,
            )
            name_node.text = corrected

    def disambiguate_name(node, anth_id):
        name = PersonName.from_element(node)
        ids = people.get_ids(name)

        if len(ids) > 1:
            choice = -1
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; Please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                choice = int(input("--> "))

            node.attrib["id"] = ids[choice]

    for collection_id, collection in collections.items():
        # Newly added volumes, so we can normalize and name-disambig later
        newly_added_volumes = []

        collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                       f"{collection_id}.xml")
        if os.path.exists(collection_file):
            root_node = etree.parse(collection_file).getroot()
        else:
            root_node = make_simple_element("collection",
                                            attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume",
                attrib={"id": volume_id, "ingest-date": args.ingest_date},
            )

            # Replace the existing one if present
            existing_volume_node = root_node.find(f"./volume[@id='{volume_id}']")
            for i, child in enumerate(root_node):
                if child.attrib["id"] == volume_id:
                    root_node[i] = volume_node
                    break
            else:
                root_node.append(volume_node)

            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    for author_or_editor in chain(
                            paper_node.findall("./author"),
                            paper_node.findall("./editor")):
                        meta_node.append(author_or_editor)
                        author_or_editor.tag = "editor"
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            attrib={"hash": compute_hash_from_file(book_dest_path)},
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields
                    for child in paper_node:
                        if child.tag in [
                                "editor",
                                "address",
                                "booktitle",
                                "publisher",
                                "year",
                                "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                            "hash": compute_hash_from_file(path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

                # Normalize
                for oldnode in paper_node:
                    normalize(oldnode, informat="latex")

                # Adjust the language tag
                language_node = paper_node.find("./language")
                if language_node is not None:
                    try:
                        lang = iso639.languages.get(name=language_node.text)
                    except KeyError:
                        raise Exception(f"Can't find language '{language_node.text}'")
                    language_node.text = lang.part3
                    print(language_node.text)

                # Fix author names
                for name_node in chain(paper_node.findall("./author"),
                                       paper_node.findall("./editor")):
                    disambiguate_name(name_node, paper_id_full)
                    person = PersonName.from_element(name_node)
                    for name_part in name_node:
                        correct_caps(person, name_part, paper_id_full)

            # Other data from the meta file
            if "isbn" in meta:
                make_simple_element("isbn", meta["isbn"], parent=meta_node)

        indent(root_node)

        tree = etree.ElementTree(root_node)
        tree.write(collection_file,
                   encoding="UTF-8",
                   xml_declaration=True,
                   with_tail=True)
    logging.basicConfig(level=args.verbose)

    is_tacl = "tacl" in args.year_root.stem
    venue = TACL if is_tacl else CL  # J for CL, Q for TACL.

    year = args.year_root.stem.split(".")[1]
    year_suffix = year[-2:]  # Feels hacky, too.
    collection_id = year + "." + venue

    collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                   f"{collection_id}.xml")
    if os.path.exists(collection_file):
        collection = etree.parse(collection_file).getroot()
    else:
        collection = make_simple_element("collection",
                                         attrib={"id": collection_id})

    tacl_glob = "tacl.20*.*/tacl.20*.*.xml"
    # volume_info = get_volume_info(list(args.year_root.glob("*.*.*/*.*.*.xml"))[0])
    # volume.append(volume_info)

    pdf_destination = Path(args.pdfs_dir)
    pdf_destination = pdf_destination / "pdf" / venue
    pdf_destination.mkdir(parents=True, exist_ok=True)

    previous_issue_info = None
    papers = []
    for xml in sorted(args.year_root.glob("*_a_*/*.xml")):
        # print(xml)
def main(volumes):
    formatter = MarkupFormatter()

    ## Assemble container
    doi_batch = make_simple_element(
        'doi_batch',
        attrib={
            'xmlns': 'http://www.crossref.org/schema/4.4.1',
            '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation':
            'http://www.crossref.org/schema/4.4.1 http://www.crossref.org/schema/deposit/crossref4.4.1.xsd',
            'version': '4.4.1'
        },
        namespaces={'xsi': 'http://www.w3.org/2001/XMLSchema-instance'})
    new_volume = etree.ElementTree(doi_batch)

    ## Assemble head
    head = make_simple_element('head', parent=new_volume.getroot())
    dbi = make_simple_element('doi_batch_id', text=str(int(time.time())),
                              parent=head)
    timestamp = make_simple_element('timestamp', text=str(int(time.time())),
                                    parent=head)
    depositor = make_simple_element('depositor', parent=head)
    depositor_name = make_simple_element('depositor_name', text=DEPOSITOR_NAME,
                                         parent=depositor)
    email_address = make_simple_element('email_address', text=EMAIL_ADDRESS,
                                        parent=depositor)
    registrant = make_simple_element('registrant', text=REGISTRANT, parent=head)

    ## Assemble body
    body = make_simple_element('body', parent=new_volume.getroot())
    year = ""
    start_month = ""
    end_month = ""

    for full_volume_id in sorted(volumes):
        collection_id, volume_id, _ = deconstruct_anthology_id(full_volume_id)

        collection_file = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data',
                                       'xml', f'{collection_id}.xml')
        tree = etree.parse(collection_file)

        v = tree.getroot().find(f"./volume[@id='{volume_id}']")
        if v is None:
            print(f"* Can't find volume {full_volume_id}", file=sys.stderr)
            continue

        ## Assemble frontmatter
        c = make_simple_element('conference', parent=body)
        contribs = make_simple_element('contributors', parent=c)

        editor_index = 0

        meta = v.find('./meta')
        for tag in meta:
            if tag.tag == 'year':
                year = tag.text
            elif tag.tag == 'month':
                month = tag.text
                try:
                    start_month = MONTH_HASH[re.split('[-–]', month)[0]]
                    end_month = MONTH_HASH[re.split('[-–]', month)[1]]
                except IndexError as e:  # only one month
                    start_month = MONTH_HASH[month]
                    end_month = MONTH_HASH[month]
            elif tag.tag == 'url':
                url = tag.text
            elif tag.tag == 'booktitle':
                booktitle = tag.text
            elif tag.tag == 'address':
                address = tag.text
            elif tag.tag == 'publisher':
                publisher = tag.text
            elif tag.tag == 'editor':
                pn = make_simple_element(
                    'person_name',
                    parent=contribs,
                    attrib={
                        'contributor_role': 'chair',
                        'sequence': 'first' if editor_index == 0 else 'additional'
                    })
                editor_index += 1
                for name_part in tag:
                    if name_part.tag == 'first':
                        gn = make_simple_element('given_name', parent=pn,
                                                 text=name_part.text)
                    elif name_part.tag == 'last':
                        sn = make_simple_element('surname', text=name_part.text,
                                                 parent=pn)

        # Assemble Event Metadata
        em = make_simple_element('event_metadata', parent=c)
        cn = make_simple_element('conference_name', parent=em, text=booktitle)
        cl = make_simple_element('conference_location', parent=em, text=address)
        cd = make_simple_element('conference_date',
                                 parent=em,
                                 attrib={
                                     'start_year': year,
                                     'end_year': year,
                                     'start_month': start_month,
                                     'end_month': end_month
                                 })

        # Assemble Proceedings Metadata
        pm = make_simple_element('proceedings_metadata', parent=c,
                                 attrib={'language': 'en'})
        pt = make_simple_element('proceedings_title', parent=pm, text=booktitle)
        p = make_simple_element('publisher', parent=pm)
        pn = make_simple_element('publisher_name', parent=p, text=publisher)
        pp = make_simple_element('publisher_place', parent=p, text=PUBLISHER_PLACE)
        pd = make_simple_element('publication_date', parent=pm)
        y = make_simple_element('year', parent=pd, text=year)
        noisbn = make_simple_element('noisbn', parent=pm,
                                     attrib={'reason': 'simple_series'})

        # DOI assignation data
        dd = make_simple_element('doi_data', parent=pm)
        doi = make_simple_element('doi', parent=dd, text=DOI_PREFIX + url)
        resource = make_simple_element('resource', parent=dd,
                                       text=ANTHOLOGY_URL.format(url))

        for paper in v.findall('./paper'):
            ## Individual Paper Data
            # TODO: this is not future-proof, should use anthology.util library functions
            aa_id = ""
            if len(url) == 6:
                aa_id = '{:02d}'.format(int(paper.attrib['id']))
            else:
                if len(url) == 5:
                    aa_id = '{:03d}'.format(int(paper.attrib['id']))

            cp = make_simple_element('conference_paper', parent=c)

            # contributors
            contribs = make_simple_element('contributors', parent=cp)
            author_index = 0
            for author in paper.findall('./author'):
                pn = make_simple_element(
                    'person_name',
                    parent=contribs,
                    attrib={
                        'contributor_role': 'author',
                        'sequence': 'first' if author_index == 0 else 'additional'
                    })
                author_index += 1
                for name_part in author:
                    if name_part.tag == 'first':
                        gn = make_simple_element('given_name', parent=pn,
                                                 text=name_part.text)
                    elif name_part.tag == 'last':
                        sn = make_simple_element('surname', text=name_part.text,
                                                 parent=pn)

            for title in paper.iter(tag='title'):
                o_titles = make_simple_element('titles', parent=cp)
                o_title = make_simple_element('title', parent=o_titles,
                                              text=formatter.as_text(title))

            pd = make_simple_element('publication_date', parent=cp)
            o_year = make_simple_element('year', parent=pd)
            o_year.text = year

            for pages in paper.iter(tag='pages'):
                o_pages = make_simple_element('pages', parent=cp)
                fp = make_simple_element('first_page', parent=o_pages)
                lp = make_simple_element('last_page', parent=o_pages)
                try:
                    fp.text = re.split('[-–]', pages.text)[0]
                    lp.text = re.split('[-–]', pages.text)[1]
                except IndexError as e:  # only one page
                    fp.text = pages.text
                    lp.text = pages.text

            # DOI assignation data
            dd = make_simple_element('doi_data', parent=cp)
            doi = make_simple_element('doi', parent=dd,
                                      text=DOI_PREFIX + url + aa_id)
            resource = make_simple_element('resource', parent=dd,
                                           text=ANTHOLOGY_URL.format(url + aa_id))

    print(
        etree.tostring(new_volume,
                       pretty_print=True,
                       encoding='UTF-8',
                       xml_declaration=True,
                       with_tail=True).decode('utf-8'))
def main(args):
    year, venue, _ = os.path.basename(args.tsv_file.name).split(".")

    # Set the volume name from the collection file, or default to 1.
    # The file name is either "2012.eamt.tsv" or "2012.eamt-main.tsv".
    # The default volume name is "1".
    if "-" in venue:
        venue, volume_id = venue.split("-")
    else:
        volume_id = "1"
    collection_id = f"{year}.{venue}"

    tree = etree.ElementTree(
        make_simple_element("collection", attrib={"id": collection_id}))

    now = datetime.now()
    today = f"{now.year}-{now.month:02d}-{now.day:02d}"
    volume = make_simple_element("volume",
                                 attrib={"id": volume_id, "ingest-date": today})
    tree.getroot().insert(0, volume)

    # Location of entire-proceedings PDF
    proceedings_pdf = args.proceedings

    # Create the metadata for the paper
    meta = None
    for row in csv.DictReader(args.meta_file, delimiter="\t"):
        current_collection_id = f"{row['Year']}.{row['Conference code']}"
        if current_collection_id == collection_id:
            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle", row["Conference title"], parent=meta)
            make_simple_element("publisher", row["Publisher"], parent=meta)
            make_simple_element("address", row["Location"], parent=meta)
            make_simple_element("month", row["Dates held"], parent=meta)
            make_simple_element("year", row["Year"], parent=meta)

            url = row["URL"]

            if url.endswith(".pdf"):
                if proceedings_pdf:
                    print(
                        "Overriding --proceedings with proceedings PDF found in conference list",
                        file=sys.stderr,
                    )
                proceedings_pdf = url
            elif "Complete PDF" in row and row["Complete PDF"] != "":
                proceedings_pdf = row["Complete PDF"]

            # volume PDF
            if proceedings_pdf is not None:
                volume_anth_id = f"{collection_id}-{volume_id}"
                pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                              f"{volume_anth_id}.pdf")
                download(proceedings_pdf, pdf_local_path)
                with open(pdf_local_path, "rb") as f:
                    checksum = compute_hash(f.read())
                make_simple_element("url", volume_anth_id,
                                    attrib={"hash": checksum}, parent=meta)
                proceedings_pdf = pdf_local_path

            if row["Editors"] != "" and "?" not in row["Editors"]:
                editors = row["Editors"].split(" and ")
                for editor_name in editors:
                    editor = make_simple_element("editor", parent=meta)
                    if ", " in editor_name:
                        last, first = editor_name.split(", ")
                    else:
                        first, last = (
                            ' '.join(editor_name.split()[:-1]),
                            editor_name.split()[-1],
                        )
                    make_simple_element("first", first, parent=editor)
                    make_simple_element("last", last, parent=editor)
            break
    else:
        print(
            f"Couldn't find conference code {collection_id} in 'Conference code' field of metadata file {args.meta_file.name}",
            file=sys.stderr,
        )
        sys.exit(1)

    if not os.path.exists(collection_id):
        print(f"Creating {collection_id}", file=sys.stderr)
        os.makedirs(collection_id)

    paperid = 0
    # Create entries for all the papers
    for row in csv.DictReader(args.tsv_file, delimiter='\t'):
        pages = row.get("Pagenumbers", None)

        title_text = row["Title"]

        # The first row might be front matter (needs a special name)
        if title_text == "Frontmatter" and paperid == 0:
            paper = make_simple_element("frontmatter", parent=volume)
        else:
            paperid += 1
            paper = make_simple_element("paper", attrib={"id": str(paperid)},
                                        parent=volume)
            # Only make the title for not-the-frontmatter
            make_simple_element("title", title_text, parent=paper)

        author_list = row["Authors"].split(" and ")

        for author_name in author_list:
            if author_name == "":
                continue
            author = make_simple_element("author", parent=paper)
            if ", " in author_name:
                last, first = author_name.split(", ")
            else:
                first, last = (' '.join(author_name.split()[:-1]),
                               author_name.split()[-1])
            make_simple_element("first", first, parent=author)
            make_simple_element("last", last, parent=author)

        if pages is not None:
            make_simple_element("pages", pages, parent=paper)

        # Find the PDF, either listed directly, or extracted from the proceedings PDF
        anth_id = f"{collection_id}-{volume_id}.{paperid}"
        pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                      f"{anth_id}.pdf")
        url = None
        if "Pdf" in row and row["Pdf"] != "":
            if download(row["Pdf"], pdf_local_path):
                url = anth_id
        elif "pages in pdf" in row:
            pdf_pages = row["pages in pdf"]
            extract_pages(proceedings_pdf, pdf_pages, pdf_local_path)
            url = anth_id

        if url is not None:
            with open(pdf_local_path, "rb") as f:
                checksum = compute_hash(f.read())
            make_simple_element("url", url, attrib={"hash": checksum}, parent=paper)

        if "Abstract" in row:
            make_simple_element("abstract", row["Abstract"], parent=paper)

        if "Presentation" in row:
            url = row["Presentation"]
            if url is not None and url != "" and url != "None":
                extension = row["Presentation"].split(".")[-1]
                name = f"{anth_id}.Presentation.{extension}"
                local_path = os.path.join(
                    args.anthology_files_path,
                    "..",
                    "attachments",
                    venue,
                    name,
                )
                if download(row["Presentation"], local_path):
                    make_simple_element("attachment", name,
                                        attrib={"type": "presentation"},
                                        parent=paper)

        # Normalize
        for node in paper:
            normalize(node, informat="latex")

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    collection_file = os.path.join(args.anthology, "data", "xml",
                                   f"{collection_id}.xml")
    tree.write(collection_file, encoding="UTF-8", xml_declaration=True,
               with_tail=True)
def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
    venue_keys = [
        venue["slug"].lower()
        for _, venue in VenueIndex(srcdir=anthology_datadir).items()
    ]

    # Build list of volumes, confirm uniqueness
    unseen_venues = []
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))
        venue_name = meta["abbrev"].lower()

        if venue_name not in venue_keys:
            unseen_venues.append(meta["abbrev"])

        meta["path"] = proceedings
        meta["collection_id"] = collection_id = (meta["year"] + "." +
                                                 meta["abbrev"].lower())
        volume_name = meta["volume"].lower()
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print(f"Error: duplicate volume {volume_full_id}")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

    # Make sure all venues exist
    if len(unseen_venues) > 0:
        print("FATAL: The following venue(s) don't exist in venues.yaml")
        for venue in unseen_venues:
            print(f"- {venue}")
        print("Please create entries for them and re-ingest.")
        sys.exit(1)

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        year = meta["year"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        # copy the book
        book_src_filename = meta["abbrev"] + "-" + year
        book_src_path = os.path.join(root_path, book_src_filename) + ".pdf"
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf")
            if not args.dry_run and not os.path.exists(book_dest_path):
                log(f"Copying {book_src_path} -> {book_dest_path}", args.dry_run)
                shutil.copyfile(book_src_path, book_dest_path)

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # names are {abbrev}{number}.pdf
            match = re.match(rf".*\.(\d+)\.pdf", pdf_file)
            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf")
                if not args.dry_run and not os.path.exists(pdf_dest_path):
                    log(f"Copying {pdf_src_path} -> {pdf_dest_path}", args.dry_run)
                    shutil.copyfile(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir, venue_name)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(os.path.join(root_path, "additional")):
                attachment_file_path = os.path.join(root_path, "additional",
                                                    attachment_file)

                match = re.match(
                    rf"{year}\.{venue_name}-\w+\.(\d+)_?(\w+)\.(\w+)$",
                    attachment_file)
                if match is None:
                    print(
                        f"* Warning: no attachment match for {attachment_file}",
                        file=sys.stderr,
                    )
                    sys.exit(2)

                paper_num, type_, ext = match.groups()
                paper_num = int(paper_num)

                file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                dest_path = os.path.join(attachments_dest_dir, file_name)
                if not args.dry_run and not os.path.exists(dest_path):
                    log(f"Copying {attachment_file} -> {dest_path}", args.dry_run)
                    shutil.copyfile(attachment_file_path, dest_path)

                collections[collection_id][volume_name][paper_num][
                    "attachments"].append((dest_path, type_))

    people = AnthologyIndex(None, srcdir=anthology_datadir)

    def disambiguate_name(node, anth_id):
        name = PersonName.from_element(node)
        ids = people.get_ids(name)

        if len(ids) > 1:
            choice = -1
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; Please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                choice = int(input("--> "))

            node.attrib["id"] = ids[choice]

    for collection_id, collection in collections.items():
        # Newly added volumes, so we can normalize and name-disambig later
        newly_added_volumes = []

        collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                       f"{collection_id}.xml")
        if os.path.exists(collection_file):
            root_node = etree.parse(collection_file).getroot()
        else:
            root_node = make_simple_element("collection",
                                            attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume",
                attrib={"id": volume_id, "ingest-date": args.ingest_date},
            )

            # Replace the existing one if present
            existing_volume_node = root_node.find(f"./volume[@id='{volume_id}']")
            for i, child in enumerate(root_node):
                if child.attrib["id"] == volume_id:
                    root_node[i] = volume_node
                    break
            else:
                root_node.append(volume_node)

            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)
                # print(etree.tostring(paper_node, pretty_print=True))

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    for author_or_editor in chain(
                            paper_node.findall("./author"),
                            paper_node.findall("./editor")):
                        meta_node.append(author_or_editor)
                        author_or_editor.tag = "editor"
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            attrib={"hash": compute_hash_from_file(book_dest_path)},
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields
                    for child in paper_node:
                        if child.tag in [
                                "editor",
                                "address",
                                "booktitle",
                                "publisher",
                                "year",
                                "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                            "hash": compute_hash_from_file(path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

                # Normalize
                for oldnode in paper_node:
                    normalize(oldnode, informat="latex")

                for name_node in chain(paper_node.findall("./author"),
                                       paper_node.findall("./editor")):
                    disambiguate_name(name_node, paper_id_full)

            # Other data from the meta file
            if "isbn" in meta:
                make_simple_element("isbn", meta["isbn"], parent=meta_node)

        indent(root_node)

        tree = etree.ElementTree(root_node)
        tree.write(collection_file,
                   encoding="UTF-8",
                   xml_declaration=True,
                   with_tail=True)
def main(args):
    year = args.year
    venue = args.venue
    volume_id = args.volume
    collection_id = f"{year}.{venue}"

    splitter = NameSplitter(anthology_dir=args.anthology_dir)

    collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                   f"{collection_id}.xml")
    if os.path.exists(collection_file):
        tree = etree.parse(collection_file)
    else:
        tree = etree.ElementTree(
            make_simple_element("collection", attrib={"id": collection_id}))

    now = datetime.now()
    today = f"{now.year}-{now.month:02d}-{now.day:02d}"

    volume_node = tree.getroot().find(f"./volume[@id='{volume_id}']")
    if volume_node is not None:
        tree.getroot().remove(volume_node)

    volume = make_simple_element("volume",
                                 attrib={"id": volume_id, "ingest-date": today},
                                 parent=tree.getroot())

    if not os.path.exists(collection_id):
        print(f"Creating {collection_id}", file=sys.stderr)
        os.makedirs(collection_id)

    # Create entries for all the papers
    for paperid, row in enumerate(csv.DictReader(args.tsv_file,
                                                 delimiter=args.delimiter)):
        pages = row.get("pages", None)

        if paperid == 0:
            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle", row["booktitle"], parent=meta)
            make_simple_element("publisher", row["publisher"], parent=meta)
            make_simple_element("address", row["address"], parent=meta)
            make_simple_element("month", row["month"], parent=meta)
            make_simple_element("year", year, parent=meta)

            editors = row["author"].split(" and ")
            row["author"] = ""
            for editor_name in editors:
                editor = make_simple_element("editor", parent=meta)
                surname, givenname = splitter.best_split(editor_name)
                make_simple_element("first", givenname, parent=editor)
                make_simple_element("last", surname, parent=editor)

            # volume PDF
            proceedings_pdf = args.proceedings_pdf
            if proceedings_pdf is not None:
                volume_anth_id = f"{collection_id}-{volume_id}"
                pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                              f"{volume_anth_id}.pdf")
                retrieve_url(proceedings_pdf, pdf_local_path)
                checksum = compute_hash_from_file(pdf_local_path)
                make_simple_element("url", volume_anth_id,
                                    attrib={"hash": checksum}, parent=meta)
                proceedings_pdf = pdf_local_path

        title_text = row["title"]

        # The first row might be front matter (needs a special name)
        if paperid == 0 and title_text.lower() in ["frontmatter", "front matter"]:
            paper = make_simple_element("frontmatter", parent=volume)
        else:
            if paperid == 0:
                # Not frontmatter, so paper 1
                paperid += 1
            paper = make_simple_element("paper", attrib={"id": str(paperid)},
                                        parent=volume)
            # Only make the title for not-the-frontmatter
            make_simple_element("title", title_text, parent=paper)

        author_list = row["author"].split(" and ")

        for author_name in author_list:
            if author_name == "":
                continue
            author = make_simple_element("author", parent=paper)
            surname, givenname = splitter.best_split(author_name)
            make_simple_element("first", givenname, parent=author)
            make_simple_element("last", surname, parent=author)

        if pages is not None and pages != "":
            make_simple_element("pages", pages, parent=paper)

        # Find the PDF, either listed directly, or extracted from the proceedings PDF
        anth_id = f"{collection_id}-{volume_id}.{paperid}"
        pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                      f"{anth_id}.pdf")
        url = None
        if "pdf" in row and row["pdf"] != "":
            if retrieve_url(row["pdf"], pdf_local_path):
                url = anth_id
            else:
                print("Can't find", row["pdf"])
        elif "pages in pdf" in row:
            pdf_pages = row["pages in pdf"]
            extract_pages(proceedings_pdf, pdf_pages, pdf_local_path)
            url = anth_id

        if url is not None:
            checksum = compute_hash_from_file(pdf_local_path)
            make_simple_element("url", url, attrib={"hash": checksum}, parent=paper)

        if "abstract" in row and row["abstract"] != "":
            make_simple_element("abstract", row["abstract"], parent=paper)

        if "presentation" in row:
            url = row["presentation"]
            if url is not None and url != "" and url != "None":
                extension = row["presentation"].split(".")[-1]
                name = f"{anth_id}.Presentation.{extension}"
                local_path = os.path.join(
                    args.anthology_files_path,
                    "..",
                    "attachments",
                    venue,
                    name,
                )
                if retrieve_url(row["presentation"], local_path):
                    make_simple_element(
                        "attachment",
                        name,
                        attrib={
                            "type": "presentation",
                            "hash": compute_hash_from_file(local_path),
                        },
                        parent=paper,
                    )

        # Normalize
        for node in paper:
            normalize(node, informat="latex")

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    tree.write(collection_file, encoding="UTF-8", xml_declaration=True,
               with_tail=True)
def main(args):
    def maybe_copy(file_from, file_to):
        if not args.dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    change_type = "erratum" if args.erratum else "revision"
    change_letter = "e" if args.erratum else "v"

    print(f"Processing {change_type} to {args.anthology_id}...")

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        download_file(args.path, input_file_path)
    else:
        input_file_path = args.path

    validate_file_type(input_file_path)

    collection_id, volume_id, paper_id = deconstruct_anthology_id(args.anthology_id)
    paper_extension = args.path.split(".")[-1]

    # The new version
    revno = None

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Files for old-style IDs are stored under anthology-files/pdf/P/P19/*
    # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/*
    if is_newstyle_id(args.anthology_id):
        venue_name = collection_id.split(".")[1]
        output_dir = os.path.join(args.anthology_dir, "pdf", venue_name)
    else:
        output_dir = os.path.join(args.anthology_dir, "pdf", collection_id[0],
                                  collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{args.anthology_id}.pdf")

    # Update XML
    xml_file = os.path.join(
        os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{collection_id}.xml")
    tree = ET.parse(xml_file)
    if paper_id == "0":
        paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
    else:
        paper = tree.getroot().find(
            f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if args.erratum else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not args.dry_run:
            # Update the URL hash on the <url> tag
            url = paper.find("./url")
            if url is not None:
                url.attrib["hash"] = checksum

            if not args.erratum and revno == 2:
                if paper.find("./url") is not None:
                    current_version_url = infer_url(paper.find("./url").text) + ".pdf"

                # Download original file
                # There are no versioned files the first time around, so create the
                # first one (essentially backing up the original version)
                revised_file_v1_path = os.path.join(
                    output_dir, f"{args.anthology_id}{change_letter}1.pdf")

                download_file(current_version_url, revised_file_v1_path)
                validate_file_type(revised_file_v1_path)

                with open(revised_file_v1_path, "rb") as f:
                    old_checksum = compute_hash(f.read())

                # First revision requires making the original version explicit
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{args.anthology_id}{change_letter}1",
                        "hash": old_checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                args.explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{args.anthology_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": args.date,
                },
                parent=paper,
            )

            indent(tree.getroot())
            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML',
                  file=sys.stderr)
    else:
        print(
            f"-> FATAL: paper ID {args.anthology_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    revised_file_versioned_path = os.path.join(
        output_dir, f"{args.anthology_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(input_file_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if not args.erratum:
        maybe_copy(input_file_path, canonical_path)

    if args.path.startswith("http"):
        os.remove(input_file_path)
def main(volumes):
    formatter = MarkupFormatter()

    ## Assemble container
    doi_batch = make_simple_element(
        "doi_batch",
        attrib={
            "xmlns": "http://www.crossref.org/schema/4.4.1",
            "{http://www.w3.org/2001/XMLSchema-instance}schemaLocation":
            "http://www.crossref.org/schema/4.4.1 http://www.crossref.org/schema/deposit/crossref4.4.1.xsd",
            "version": "4.4.1",
        },
        namespaces={"xsi": "http://www.w3.org/2001/XMLSchema-instance"},
    )
    new_volume = etree.ElementTree(doi_batch)

    ## Assemble head
    head = make_simple_element("head", parent=new_volume.getroot())
    dbi = make_simple_element("doi_batch_id", text=str(int(time.time())),
                              parent=head)
    timestamp = make_simple_element("timestamp", text=str(int(time.time())),
                                    parent=head)
    depositor = make_simple_element("depositor", parent=head)
    depositor_name = make_simple_element("depositor_name", text=DEPOSITOR_NAME,
                                         parent=depositor)
    email_address = make_simple_element("email_address", text=EMAIL_ADDRESS,
                                        parent=depositor)
    registrant = make_simple_element("registrant", text=REGISTRANT, parent=head)

    ## Assemble body
    body = make_simple_element("body", parent=new_volume.getroot())
    year = ""
    start_month = ""
    end_month = ""

    for full_volume_id in sorted(volumes):
        collection_id, volume_id, _ = deconstruct_anthology_id(full_volume_id)

        collection_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data",
                                       "xml", f"{collection_id}.xml")
        tree = etree.parse(collection_file)

        v = tree.getroot().find(f"./volume[@id='{volume_id}']")
        if v is None:
            print(f"* Can't find volume {full_volume_id}", file=sys.stderr)
            continue

        ## Assemble frontmatter
        c = make_simple_element("conference", parent=body)
        contribs = make_simple_element("contributors", parent=c)

        editor_index = 0

        meta = v.find("./meta")
        for tag in meta:
            if tag.tag == "year":
                year = tag.text
            elif tag.tag == "month":
                month = tag.text
                try:
                    start_month = MONTH_HASH[re.split("[-–]", month)[0]]
                    end_month = MONTH_HASH[re.split("[-–]", month)[1]]
                except IndexError as e:  # only one month
                    start_month = MONTH_HASH[month]
                    end_month = MONTH_HASH[month]
                except Exception as e:
                    print(
                        f"FATAL: can't parse month {month} in {full_volume_id}",
                        file=sys.stderr,
                    )
                    sys.exit(1)
            elif tag.tag == "url":
                url = tag.text
            elif tag.tag == "booktitle":
                booktitle = formatter.as_text(tag)
            elif tag.tag == "address":
                address = tag.text
            elif tag.tag == "publisher":
                publisher = tag.text
            elif tag.tag == "editor":
                pn = make_simple_element(
                    "person_name",
                    parent=contribs,
                    attrib={
                        "contributor_role": "chair",
                        "sequence": "first" if editor_index == 0 else "additional",
                    },
                )
                editor_index += 1
                for name_part in tag:
                    if name_part.tag == "first":
                        gn = make_simple_element("given_name", parent=pn,
                                                 text=name_part.text)
                    elif name_part.tag == "last":
                        sn = make_simple_element("surname", text=name_part.text,
                                                 parent=pn)

        # Assemble Event Metadata
        em = make_simple_element("event_metadata", parent=c)
        cn = make_simple_element("conference_name", parent=em, text=booktitle)
        cl = make_simple_element("conference_location", parent=em, text=address)
        cd = make_simple_element(
            "conference_date",
            parent=em,
            attrib={
                "start_year": year,
                "end_year": year,
                "start_month": start_month,
                "end_month": end_month,
            },
        )

        # Assemble Proceedings Metadata
        pm = make_simple_element("proceedings_metadata", parent=c,
                                 attrib={"language": "en"})
        pt = make_simple_element("proceedings_title", parent=pm, text=booktitle)
        p = make_simple_element("publisher", parent=pm)
        pn = make_simple_element("publisher_name", parent=p, text=publisher)
        pp = make_simple_element("publisher_place", parent=p, text=PUBLISHER_PLACE)
        pd = make_simple_element("publication_date", parent=pm)
        y = make_simple_element("year", parent=pd, text=year)
        noisbn = make_simple_element("noisbn", parent=pm,
                                     attrib={"reason": "simple_series"})

        # DOI assignation data
        dd = make_simple_element("doi_data", parent=pm)
        doi = make_simple_element("doi", parent=dd, text=DOI_PREFIX + url)
        resource = make_simple_element("resource", parent=dd,
                                       text=ANTHOLOGY_URL.format(url))

        for paper in v.findall("./paper"):
            ## Individual Paper Data
            # TODO: this is not future-proof, should use anthology.util library functions
            aa_id = ""
            if len(url) == 6:
                aa_id = "{:02d}".format(int(paper.attrib["id"]))
            else:
                if len(url) == 5:
                    aa_id = "{:03d}".format(int(paper.attrib["id"]))

            cp = make_simple_element("conference_paper", parent=c)

            # contributors
            contribs = make_simple_element("contributors", parent=cp)
            author_index = 0
            for author in paper.findall("./author"):
                pn = make_simple_element(
                    "person_name",
                    parent=contribs,
                    attrib={
                        "contributor_role": "author",
                        "sequence": "first" if author_index == 0 else "additional",
                    },
                )
                author_index += 1
                for name_part in author:
                    if name_part.tag == "first":
                        gn = make_simple_element("given_name", parent=pn,
                                                 text=name_part.text)
                    elif name_part.tag == "last":
                        sn = make_simple_element("surname", text=name_part.text,
                                                 parent=pn)

            for title in paper.iter(tag="title"):
                o_titles = make_simple_element("titles", parent=cp)
                o_title = make_simple_element("title", parent=o_titles,
                                              text=formatter.as_text(title))

            pd = make_simple_element("publication_date", parent=cp)
            o_year = make_simple_element("year", parent=pd)
            o_year.text = year

            for pages in paper.iter(tag="pages"):
                o_pages = make_simple_element("pages", parent=cp)
                fp = make_simple_element("first_page", parent=o_pages)
                lp = make_simple_element("last_page", parent=o_pages)
                try:
                    fp.text = re.split("[-–]", pages.text)[0]
                    lp.text = re.split("[-–]", pages.text)[1]
                except IndexError as e:  # only one page
                    fp.text = pages.text
                    lp.text = pages.text

            # DOI assignation data
            dd = make_simple_element("doi_data", parent=cp)
            doi = make_simple_element("doi", parent=dd,
                                      text=DOI_PREFIX + url + aa_id)
            resource = make_simple_element("resource", parent=dd,
                                           text=ANTHOLOGY_URL.format(url + aa_id))

    print(
        etree.tostring(
            new_volume,
            pretty_print=True,
            encoding="UTF-8",
            xml_declaration=True,
            with_tail=True,
        ).decode("utf-8"))
def main(args): collections = defaultdict(OrderedDict) volumes = {} # Build list of volumes, confirm uniqueness for proceedings in args.proceedings: meta = read_meta(os.path.join(proceedings, "meta")) meta["path"] = proceedings meta["collection_id"] = collection_id = (meta["year"] + "." + meta["abbrev"].lower()) volume_name = meta["volume_name"] volume_full_id = f"{collection_id}-{volume_name}" if volume_full_id in volumes: print("Error: ") collections[collection_id][volume_name] = {} volumes[volume_full_id] = meta # Copy over the PDFs and attachments for volume, meta in volumes.items(): root_path = os.path.join(meta["path"], "cdrom") collection_id = meta["collection_id"] venue_name = meta["abbrev"].lower() volume_name = meta["volume_name"] pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name) if not os.path.exists(pdfs_dest_dir): os.makedirs(pdfs_dest_dir) print(f"VOLUME: {volume}") # copy the book book_src_filename = meta["abbrev"] + "-" + meta["year"] book_src_path = os.path.join(root_path, book_src_filename) + ".pdf" book_dest_path = None if os.path.exists(book_src_path): book_dest_path = ( os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf") log(f"Copying {book_src_path} -> {book_dest_path}", args.dry_run) if not args.dry_run: shutil.copyfile(book_src_path, book_dest_path) # copy the paper PDFs pdf_src_dir = os.path.join(root_path, "pdf") for pdf_file in os.listdir(pdf_src_dir): # names are {abbrev}{number}.pdf abbrev = meta["abbrev"] match = re.match(rf"{abbrev}(\d+)\.pdf", pdf_file) if match is not None: paper_num = int(match[1]) paper_id_full = f"{collection_id}-{volume_name}.{paper_num}" bib_path = os.path.join( root_path, "bib", pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"), ) pdf_src_path = os.path.join(pdf_src_dir, pdf_file) pdf_dest_path = os.path.join( pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf") log( f"Copying [{paper_id_full}] {pdf_src_path} -> {pdf_dest_path}", args.dry_run, ) if not args.dry_run: shutil.copyfile(pdf_src_path, pdf_dest_path) collections[collection_id][volume_name][paper_num] = { "anthology_id": paper_id_full, "bib": bib_path, "pdf": pdf_dest_path, "attachments": [], } # copy the attachments if os.path.exists(os.path.join(root_path, "additional")): attachments_dest_dir = os.path.join(args.attachments_dir, collection_id) if not os.path.exists(attachments_dest_dir): os.makedirs(attachments_dest_dir) for attachment_file in os.listdir( os.path.join(root_path, "additional")): match = re.match(rf"{abbrev}(\d+)_(\w+)\.(\w+)") if match is not None: paper_num, type_, ext = match.groups() paper_num = int(paper_num) file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}" dest_path = os.path.join(attachments_dest_dir, file_name) log(f"Copying {attachment_file} -> {dest_path}", args.dry_run) if not args.dry_run: shutil.copyfile(attachment_file, dest_path) collections[collection_id][volume_name][paper_num][ "attachments"].append(dest_path) people = AnthologyIndex(None, srcdir=os.path.join(os.path.dirname(sys.argv[0]), "..", "data")) for collection_id, collection in collections.items(): collection_file = os.path.join(args.anthology_dir, "data", "xml", f"{collection_id}.xml") root_node = make_simple_element("collection", attrib={"id": collection_id}) for volume_id, volume in collection.items(): volume_node = make_simple_element("volume", attrib={"id": volume_id}, parent=root_node) meta = None for paper_num, paper in sorted(volume.items()): paper_id_full = paper["anthology_id"] bibfile = paper["bib"] 
                paper_node = bib2xml(bibfile, paper_id_full)
                # print(etree.tostring(paper_node, pretty_print=True))

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta.append(title_node)
                    for editor in paper_node.findall("editor"):
                        meta.append(editor)
                    meta.append(paper_node.find("publisher"))
                    meta.append(paper_node.find("address"))
                    meta.append(paper_node.find("month"))
                    meta.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element("url",
                                            text=f"{collection_id}-{volume_name}",
                                            parent=meta)

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields
                    for child in paper_node:
                        if child.tag in [
                                "editor",
                                "address",
                                "booktitle",
                                "publisher",
                                "year",
                                "month",
                        ]:
                            paper_node.remove(child)

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

        # Normalize
        for paper in root_node.findall(".//paper"):
            for oldnode in paper:
                normalize(oldnode, informat="latex")

        # Ensure names are properly identified
        ambiguous = {}
        for paper in root_node.findall(".//paper"):
            anth_id = build_anthology_id(collection_id,
                                         paper.getparent().attrib["id"],
                                         paper.attrib["id"])

            for node in chain(paper.findall("author"), paper.findall("editor")):
                name = PersonName.from_element(node)
                ids = people.get_ids(name)
                if len(ids) > 1:
                    print(
                        f"WARNING ({anth_id}): ambiguous author {name}, defaulting to first of {ids}"
                    )
                    ambiguous[anth_id] = (name, ids)

                node.attrib["id"] = ids[0]

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(collection_file,
                   encoding="UTF-8",
                   xml_declaration=True,
                   with_tail=True)
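# A hedged sketch, not part of the ingest script: the `ambiguous` map collected
# above could be dumped to a TSV for manual review before fixing ids by hand.
# `write_ambiguity_report` and `report_path` are hypothetical names.
import csv

def write_ambiguity_report(ambiguous, report_path="ambiguous_names.tsv"):
    """Write one row per paper whose author name matched several people IDs."""
    with open(report_path, "w", newline="") as f:
        writer = csv.writer(f, delimiter="\t")
        writer.writerow(["anthology_id", "name", "candidate_ids"])
        for anth_id, (name, ids) in ambiguous.items():
            writer.writerow([anth_id, str(name), " ".join(ids)])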
def bib2xml(bibfilename, anthology_id):
    """
    Moved here from ACLPUB's anthology_xml.py script.
    """

    fields = [
        'title',
        'author',
        'editor',
        'booktitle',
        'month',
        'year',
        'address',
        'publisher',
        'pages',
        'abstract',
        'url',
        'doi',
        'language',
    ]

    collection_id, volume_name, paper_no = deconstruct_anthology_id(anthology_id)
    if paper_no == '':
        return  # skip the master bib file; we only process the individual files

    bibdata = read_bibtex(bibfilename)
    if len(bibdata.entries) != 1:
        log(f"more than one entry in {bibfilename}")
    bibkey, bibentry = bibdata.entries.items()[0]
    if len(bibentry.fields) == 0:
        log(f"parsing bib of paper {paper_no} failed")
        sys.exit(1)

    paper = make_simple_element("paper", attrib={"id": paper_no})
    for field in list(bibentry.fields) + list(bibentry.persons):
        if field not in fields:
            log(f"unknown field {field}")

    for field in fields:
        if field in ['author', 'editor']:
            if field in bibentry.persons:
                for person in bibentry.persons[field]:
                    first_text = ' '.join(person.bibtex_first_names)
                    last_text = ' '.join(person.prelast_names + person.last_names)
                    if person.lineage_names:
                        last_text += ', ' + ' '.join(person.lineage_names)

                    # Don't distinguish between authors that have only a first name
                    # vs. authors that have only a last name; always make it a last name.
                    if last_text.strip() in ['', '-']:  # Some START users have '-' for null
                        last_text = first_text
                        first_text = ''

                    name_node = make_simple_element(field, parent=paper)
                    make_simple_element("first", first_text, parent=name_node)
                    make_simple_element("last", last_text, parent=name_node)
        else:
            if field == 'url':
                value = f"{anthology_id}"
            elif field in bibentry.fields:
                value = bibentry.fields[field]
            elif field == 'bibtype':
                value = bibentry.type
            elif field == 'bibkey':
                value = bibkey
            else:
                continue

            make_simple_element(field, text=value, parent=paper)

    return paper
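# Usage sketch for bib2xml, assuming a single-entry BibTeX file on disk; the
# path and the Anthology ID below are hypothetical.
from lxml import etree

paper_node = bib2xml("P19-1001.bib", "P19-1001")
if paper_node is not None:
    print(etree.tostring(paper_node, pretty_print=True, encoding="unicode"))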
def add_revision(anth_id,
                 pdf_path,
                 explanation,
                 change_type="revision",
                 dry_run=True,
                 date=None):
    """
    Takes an Anthology ID. It then adds a revision to the Anthology XML,
    updating and writing the XML file, and copies the PDFs into place.
    For PDFs, the revised PDF is saved to {anth_id}.pdf and {anth_id}v{version}.pdf.
    For the first revision, we first copy {anth_id}.pdf to {anth_id}v1.pdf.
    """
    if date is None:
        now = datetime.now()
        date = f"{now.year}-{now.month:02d}-{now.day:02d}"

    def maybe_copy(file_from, file_to):
        if not dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to), file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    # The new version
    revno = None

    change_letter = "e" if change_type == "erratum" else "v"

    checksum = compute_hash_from_file(pdf_path)

    # Files for old-style IDs are stored under anthology-files/pdf/P/P19/*
    # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/*
    output_dir = get_pdf_dir(anth_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{anth_id}.pdf")

    # Update XML
    xml_file = get_xml_file(anth_id)
    collection_id, volume_id, paper_id = deconstruct_anthology_id(anth_id)
    tree = ET.parse(xml_file)
    if paper_id == "0":
        paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
    else:
        paper = tree.getroot().find(
            f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if change_type == "erratum" else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not dry_run:
            # Update the URL hash on the <url> tag
            url = paper.find("./url")
            if url is not None:
                url.attrib["hash"] = checksum

            if change_type == "revision" and revno == 2:
                if paper.find("./url") is not None:
                    current_version_url = infer_url(paper.find("./url").text) + ".pdf"

                # Download original file
                # There are no versioned files the first time around, so create the first one
                # (essentially backing up the original version)
                revised_file_v1_path = os.path.join(output_dir,
                                                    f"{anth_id}{change_letter}1.pdf")

                retrieve_url(current_version_url, revised_file_v1_path)
                validate_file_type(revised_file_v1_path)

                old_checksum = compute_hash_from_file(revised_file_v1_path)

                # First revision requires making the original version explicit
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{anth_id}{change_letter}1",
                        "hash": old_checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{anth_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": date,
                },
                parent=paper,
            )

            indent(tree.getroot())
            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML',
                  file=sys.stderr)
    else:
        print(
            f"-> FATAL: paper ID {anth_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    revised_file_versioned_path = os.path.join(
        output_dir, f"{anth_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(pdf_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if change_type == "revision":
        maybe_copy(pdf_path, canonical_path)
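# A hedged usage sketch. For paper P19-1001 (hypothetical), the first revision
# leaves three files behind: P19-1001v1.pdf (backup of the original),
# P19-1001v2.pdf (the new version), and P19-1001.pdf (the canonical copy,
# overwritten). Running with dry_run=True first shows what would be copied.
add_revision(
    "P19-1001",
    "/tmp/P19-1001-corrected.pdf",
    explanation="Corrected a typo in Table 2.",
    change_type="revision",
    dry_run=True,
)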
# Normalize
for paper in root_being_added.findall(".//paper"):
    for oldnode in paper:
        normalize(oldnode, informat="latex")

# Ingest each volume.
# First, find the XML file.
collection_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                               f"{collection_id}.xml")
if os.path.exists(collection_file):
    existing_tree = etree.parse(collection_file)
else:
    existing_tree = etree.ElementTree(
        make_simple_element("collection", attrib={"id": collection_id}))

# Insert each volume
for i, new_volume in enumerate(root_being_added.findall("volume")):
    new_volume_id = int(new_volume.attrib["id"])
    existing_volume = existing_tree.getroot().find(
        f"./volume[@id='{new_volume_id}']")
    if existing_volume is None:
        new_volume.attrib["ingest-date"] = args.ingest_date

        # Find the insertion point among the other volumes
        insertion_point = 0
        for i, volume in enumerate(existing_tree.getroot()):
            if new_volume_id < int(volume.attrib["id"]):
                break
            insertion_point = i + 1
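# The insertion-point scan above, isolated as a standalone sketch (a
# hypothetical helper, assuming numeric volume ids in document order): it
# returns the child index at which the new volume keeps the ids sorted.
def find_insertion_point(existing_ids, new_id):
    insertion_point = 0
    for i, existing_id in enumerate(existing_ids):
        if new_id < existing_id:
            break
        insertion_point = i + 1
    return insertion_point

assert find_insertion_point([1, 2, 4], 3) == 2  # insert before volume 4
assert find_insertion_point([1, 2, 4], 5) == 3  # append at the end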
def main(args):
    anth = anthology.Anthology(importdir=os.path.join(args.anthology, "data"))
    splitter = NameSplitter(anth)

    paper_nums = {}

    venue = "lilt"
    prev_year = None
    prev_volume = None
    for row in csv.DictReader(args.tsv_file, delimiter='\t'):
        year = row.get("year")
        month = row.get("month")
        issue = row.get("issue#", "")
        abstract = row.get("abstract")

        collection_id = f"{year}.lilt"
        if year != prev_year:
            if prev_year is not None:
                dump_collection(
                    tree,
                    os.path.join(args.anthology, "data", "xml",
                                 f"{prev_year}.lilt.xml"),
                )
            tree = etree.ElementTree(
                make_simple_element("collection", attrib={"id": collection_id}))
            root = tree.getroot()
            prev_year = year

        volume_name = row.get("Volume#")
        if volume_name != prev_volume:
            volume = make_simple_element("volume",
                                         attrib={"id": volume_name},
                                         parent=root)
            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle", row.get("Booktitle"), parent=meta)
            make_simple_element("publisher", "CSLI Publications", parent=meta)
            make_simple_element("year", year, parent=meta)
            if month:
                make_simple_element("month", month, parent=meta)

        paper_num = paper_nums[volume_name] = paper_nums.get(volume_name, 0) + 1
        prev_volume = volume_name

        paper = make_simple_element("paper",
                                    attrib={"id": str(paper_num)},
                                    parent=volume)
        paper_id = f"{collection_id}-{volume_name}.{paper_num}"

        make_simple_element("title", row.get("title"), parent=paper)

        authors = row.get("authors")
        for author_name in authors.split(" and "):
            author = make_simple_element("author", parent=paper)
            surname, givenname = splitter.best_split(author_name)
            make_simple_element("first", givenname, parent=author)
            make_simple_element("last", surname, parent=author)

        if abstract:
            make_simple_element("abstract", abstract, parent=paper)

        if issue != "":
            make_simple_element("issue", issue, parent=paper)

        for node in paper:
            normalize(node, "latex")

        dest_dir = f"{args.anthology_files_path}/lilt"
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        source_path = os.path.join(
            "pdf", row.get("PDF").replace("\\", "/").replace("../", ""))
        if os.path.exists(source_path):
            dest_path = os.path.join(
                dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf")
            shutil.copy(source_path, dest_path)
            print(f"Copying {source_path} to {dest_path}", file=sys.stderr)
            os.chmod(dest_path, 0o644)

            checksum = compute_hash_from_file(dest_path)
            make_simple_element("url", paper_id, attrib={"hash": checksum}, parent=paper)

    dump_collection(
        tree, os.path.join(args.anthology, "data", "xml", f"{collection_id}.xml"))
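# The per-volume paper counter above, isolated: `paper_nums` maps a volume name
# to the last number handed out, so numbering stays contiguous within each
# volume even if rows for different volumes were interleaved in the TSV.
paper_nums = {}
for volume_name in ["1", "1", "2", "1"]:
    paper_num = paper_nums[volume_name] = paper_nums.get(volume_name, 0) + 1
    print(volume_name, paper_num)  # -> 1 1, 1 2, 2 1, 1 3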