def main(datadir, filelist):
    """Verify the stored hash of each file in *filelist* against the Anthology XML.

    Each file's basename is assumed to start with its collection ID (the part
    before the first "-").  Files are grouped by collection, hashed with
    ``compute_hash``, and looked up in ``{datadir}/xml/{collection_id}.xml``;
    the stored ``hash`` attribute is compared to the fresh checksum.  All
    results are reported through ``log``; nothing is returned.

    Fix: the XPath queries and log messages previously contained the literal
    placeholder text ``(unknown)`` where the file's name belongs, so lookups
    could never match a real entry and log lines did not identify the file.
    """
    # Group (basename, checksum) pairs by collection so each collection's
    # XML file is parsed only once.
    to_check = defaultdict(list)
    for filename in filelist:
        basename = os.path.basename(filename)
        collection_id = basename.split("-")[0]
        with open(filename, "rb") as f:
            checksum = compute_hash(f.read())
        to_check[collection_id].append((basename, checksum))

    for collection_id, checklist in to_check.items():
        xml_file = f"{datadir}/xml/{collection_id}.xml"
        tree = etree.parse(xml_file)
        root = tree.getroot()
        for filename, checksum in checklist:
            if filename.endswith(".pdf"):
                # A PDF may be recorded as an <attachment> (full name), or as
                # a <url>, <erratum>, or <revision> (name sans ".pdf").
                xpath = (f'//attachment[text()="{filename}"] | '
                         f'//url[text()="{filename[:-4]}"] | '
                         f'//erratum[text()="{filename[:-4]}"] | '
                         f'//revision[@href="{filename[:-4]}"]')
            else:
                xpath = f'//attachment[text()="{filename}"]'
            find = etree.XPath(xpath)(root)
            if not find:
                log.error(
                    f"{filename}: couldn't find file in {collection_id}.xml")
                continue
            elif len(find) > 1:
                # this should never happen
                log.warning(
                    f"{filename}: multiple entries with that name in {collection_id}.xml"
                )
            # Compare the recorded hash on the first (and normally only) match.
            expected = find[0].get("hash")
            if expected != checksum:
                log.error(
                    f"{filename}: CRC32 mismatch -- {checksum} != {expected}")
            else:
                log.debug(f"{filename}: checksum verified")
def main(args):
    """Ingest one conference volume from a TSV file into an Anthology XML collection.

    Derives year/venue/volume from the TSV filename ("2012.eamt.tsv" or
    "2012.eamt-main.tsv"), builds a fresh <collection>/<volume> tree, fills in
    <meta> from the matching row of ``args.meta_file``, then creates one
    <paper> (or <frontmatter>) node per row of ``args.tsv_file``, downloading
    or extracting PDFs and attachments along the way.  Writes the result to
    ``{args.anthology}/data/xml/{collection_id}.xml``.
    """
    year, venue, _ = os.path.basename(args.tsv_file.name).split(".")

    # Set the volume name from the collection file, or default to 1
    # The file name is either "2012.eamt.tsv" or "2012.eamt-main.tsv".
    # The default volume name is "1".
    if "-" in venue:
        venue, volume_id = venue.split("-")
    else:
        volume_id = "1"

    collection_id = f"{year}.{venue}"

    # Fresh collection tree; any existing XML for this collection is replaced
    # when the file is written at the end.
    tree = etree.ElementTree(
        make_simple_element("collection", attrib={"id": collection_id}))

    now = datetime.now()
    today = f"{now.year}-{now.month:02d}-{now.day:02d}"

    volume = make_simple_element("volume", attrib={
        "id": volume_id,
        "ingest-date": today
    })
    tree.getroot().insert(0, volume)

    # Location of entire-proceedings PDF
    proceedings_pdf = args.proceedings

    # Create the metadata for the paper
    meta = None
    for row in csv.DictReader(args.meta_file, delimiter="\t"):
        current_collection_id = f"{row['Year']}.{row['Conference code']}"
        if current_collection_id == collection_id:
            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle", row["Conference title"], parent=meta)
            make_simple_element("publisher", row["Publisher"], parent=meta)
            make_simple_element("address", row["Location"], parent=meta)
            make_simple_element("month", row["Dates held"], parent=meta)
            make_simple_element("year", row["Year"], parent=meta)

            # A ".pdf" URL in the metadata row wins over --proceedings.
            url = row["URL"]
            if url.endswith(".pdf"):
                if proceedings_pdf:
                    print(
                        "Overriding --proceedings with proceedings PDF found in conference list",
                        file=sys.stderr,
                    )
                proceedings_pdf = url
            elif "Complete PDF" in row and row["Complete PDF"] != "":
                proceedings_pdf = row["Complete PDF"]

            # volume PDF: download it, hash it, and record it on <meta>;
            # from here on proceedings_pdf refers to the local copy.
            if proceedings_pdf is not None:
                volume_anth_id = f"{collection_id}-{volume_id}"
                pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                              f"{volume_anth_id}.pdf")
                download(proceedings_pdf, pdf_local_path)
                with open(pdf_local_path, "rb") as f:
                    checksum = compute_hash(f.read())
                make_simple_element("url", volume_anth_id,
                                    attrib={"hash": checksum},
                                    parent=meta)
                proceedings_pdf = pdf_local_path

            # Editors field: "Last, First and First Last and ..."; a "?"
            # marks unknown editors and suppresses the whole list.
            if row["Editors"] != "" and "?" not in row["Editors"]:
                editors = row["Editors"].split(" and ")
                for editor_name in editors:
                    editor = make_simple_element("editor", parent=meta)
                    if ", " in editor_name:
                        last, first = editor_name.split(", ")
                    else:
                        # No comma: everything but the final token is the
                        # first name.
                        first, last = (
                            ' '.join(editor_name.split()[:-1]),
                            editor_name.split()[-1],
                        )
                    make_simple_element("first", first, parent=editor)
                    make_simple_element("last", last, parent=editor)
            break
    else:
        # for/else: no metadata row matched this collection ID.
        print(
            f"Couldn't find conference code {collection_id} in 'Conference code' field of metadata file {args.meta_file.name}",
            file=sys.stderr,
        )
        sys.exit(1)

    # Scratch directory named after the collection (used by helpers such as
    # extract_pages — TODO confirm; not written to directly in this function).
    if not os.path.exists(collection_id):
        print(f"Creating {collection_id}", file=sys.stderr)
        os.makedirs(collection_id)

    paperid = 0
    # Create entries for all the papers
    for row in csv.DictReader(args.tsv_file, delimiter='\t'):
        pages = row.get("Pagenumbers", None)

        title_text = row["Title"]

        # The first row might be front matter (needs a special name)
        if title_text == "Frontmatter" and paperid == 0:
            paper = make_simple_element("frontmatter", parent=volume)
        else:
            paperid += 1
            paper = make_simple_element("paper",
                                        attrib={"id": str(paperid)},
                                        parent=volume)
            # Only make the title for not-the-frontmatter
            make_simple_element("title", title_text, parent=paper)

        author_list = row["Authors"].split(" and ")
        for author_name in author_list:
            if author_name == "":
                continue
            author = make_simple_element("author", parent=paper)
            if ", " in author_name:
                last, first = author_name.split(", ")
            else:
                first, last = ' '.join(
                    author_name.split()[:-1]), author_name.split()[-1]
            make_simple_element("first", first, parent=author)
            make_simple_element("last", last, parent=author)

        if pages is not None:
            make_simple_element("pages", pages, parent=paper)

        # Find the PDF, either listed directly, or extracted from the proceedings PDF
        anth_id = f"{collection_id}-{volume_id}.{paperid}"
        pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                      f"{anth_id}.pdf")
        url = None
        if "Pdf" in row and row["Pdf"] != "":
            if download(row["Pdf"], pdf_local_path):
                url = anth_id
        elif "pages in pdf" in row:
            # Cut the paper's page range out of the whole-proceedings PDF.
            pdf_pages = row["pages in pdf"]
            extract_pages(proceedings_pdf, pdf_pages, pdf_local_path)
            url = anth_id

        if url is not None:
            with open(pdf_local_path, "rb") as f:
                checksum = compute_hash(f.read())
            make_simple_element("url", url,
                                attrib={"hash": checksum},
                                parent=paper)

        if "Abstract" in row:
            make_simple_element("abstract", row["Abstract"], parent=paper)

        # Optional presentation attachment, stored next to the PDFs under
        # ../attachments/{venue}/.
        if "Presentation" in row:
            url = row["Presentation"]
            if url is not None and url != "" and url != "None":
                extension = row["Presentation"].split(".")[-1]
                name = f"{anth_id}.Presentation.{extension}"
                local_path = os.path.join(
                    args.anthology_files_path,
                    "..",
                    "attachments",
                    venue,
                    name,
                )
                if download(row["Presentation"], local_path):
                    make_simple_element("attachment", name,
                                        attrib={"type": "presentation"},
                                        parent=paper)

        # Normalize all child nodes (LaTeX markup -> Anthology conventions).
        for node in paper:
            normalize(node, informat="latex")

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    collection_file = os.path.join(args.anthology, "data", "xml",
                                   f"{collection_id}.xml")
    tree.write(collection_file,
               encoding="UTF-8",
               xml_declaration=True,
               with_tail=True)
# NOTE(review): fragment of a larger journal-ingest routine — the enclosing
# function/loop is outside this view.  `issue_count`, `collection`,
# `issue_info`, `year`, `volume_id`, `is_tacl`, `papernode`, `i`, `xml`,
# `write_to_here`, `old_root`, and `args` all come from that enclosing scope.
# The chunk also ends mid-branch (after `old_video = ...`).
volume.attrib["id"] = issue_count
collection.append(volume)  # xml collection = journal volume
volume.append(
    issue_info_to_node(issue_info, year, volume_id, issue_count, is_tacl)
)
papernode.attrib["id"] = f"{i}"
paper_id = get_paperid(xml, i, issue_count)
# The paper PDF is expected alongside its XML file, same stem.
pdf = xml.with_suffix(".pdf")
if not pdf.is_file():
    logging.error("Missing pdf for " + xml.name)
elif args.pdf_save_destination:
    destination = write_to_here / "{}-{}.pdf".format(volume_id, paper_id)
    shutil.copyfile(pdf, destination)
# NOTE(review): this open() is unconditional — if the PDF was missing above,
# only an error was logged and this line would raise; confirm whether the
# enclosing code guards against that.
with open(pdf, "rb") as f:
    checksum = compute_hash(f.read())
# Record the canonical URL plus content hash on the paper node.
url_text = STANDARD_URL.format(volume=volume_id, paper=paper_id)
url = etree.Element("url")
url.attrib["hash"] = checksum
url.text = url_text
papernode.append(url)
if args.old_version:
    # Look up the same paper in the previous XML to carry data forward.
    old_paper = old_root.find(f"*[@id='{paper_id}']")
    if old_paper is None:
        logging.error(
            f"No old version for {paper_id} with title {papernode.find('title').text}"
        )
    else:
        old_video = old_paper.find("video")
def main(args):
    """Register an erratum or revision PDF for an Anthology paper.

    Downloads (or copies) the new PDF, appends an <erratum>/<revision> node
    with its hash to the paper's entry in data/xml/{collection_id}.xml, and
    places the file at both the versioned path (…{e|v}{revno}.pdf) and — for
    revisions — the canonical path.  ``--dry-run`` skips all writes.
    """

    def maybe_copy(file_from, file_to):
        # Copy + chmod 644 unless --dry-run, in which case only announce.
        if not args.dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    change_type = "erratum" if args.erratum else "revision"
    change_letter = "e" if args.erratum else "v"

    print(f"Processing {change_type} to {args.anthology_id}...")

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        try:
            print(f"-> Downloading file from {args.path}", file=sys.stderr)
            with urllib.request.urlopen(args.path) as url, open(
                    input_file_path, mode="wb") as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            print("An SSL error was encountered in downloading the files.",
                  file=sys.stderr)
            sys.exit(1)
    else:
        input_file_path = args.path

    # Sanity-check the file really is what its extension claims.
    detected = filetype.guess(input_file_path)
    if detected is None or not detected.mime.endswith(detected.extension):
        mime_type = 'UNKNOWN' if detected is None else detected.mime
        print(
            f"FATAL: {args.anthology_id} file {args.path} has MIME type {mime_type}",
            file=sys.stderr,
        )
        sys.exit(1)

    collection_id, volume_id, paper_id = deconstruct_anthology_id(
        args.anthology_id)
    # NOTE(review): paper_extension is computed but never used below.
    paper_extension = args.path.split(".")[-1]

    # The new version
    revno = None

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(
        f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        # Next id = 1 + highest existing id; errata start at 1, revisions at 2
        # (id 1 is reserved for the original version).
        revisions = paper.findall(change_type)
        revno = 1 if args.erratum else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not args.dry_run:
            if not args.erratum and revno == 2:
                # First revision requires making the original version explicit
                # NOTE(review): this v1 node records `checksum`, i.e. the hash
                # of the NEW file, as the hash of the ORIGINAL version — the
                # newer variant of this script computes an `old_checksum` from
                # the downloaded original instead.  Looks wrong; confirm.
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{args.anthology_id}{change_letter}1",
                        "hash": checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                args.explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{args.anthology_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": args.date,
                },
                parent=paper,
            )
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML',
                  file=sys.stderr)

    else:
        print(
            f"-> FATAL: paper ID {args.anthology_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    # Old-style layout: anthology-files/pdf/{P}/{P19}/…
    output_dir = os.path.join(args.anthology_dir, "pdf", collection_id[0],
                              collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{args.anthology_id}.pdf")

    if not args.erratum and revno == 2:
        # There are no versioned files the first time around, so create the first one
        # (essentially backing up the original version)
        revised_file_v1_path = os.path.join(
            output_dir, f"{args.anthology_id}{change_letter}1.pdf")

        current_version = ANTHOLOGY_PDF.format(args.anthology_id)
        if not args.dry_run:
            try:
                print(
                    f"-> Downloading file from {args.path} to {revised_file_v1_path}",
                    file=sys.stderr,
                )
                with urllib.request.urlopen(current_version) as url, open(
                        revised_file_v1_path, mode="wb") as fh:
                    fh.write(url.read())
            except ssl.SSLError:
                print(
                    f"-> FATAL: An SSL error was encountered in downloading {args.path}.",
                    file=sys.stderr,
                )
                sys.exit(1)
        else:
            # NOTE(review): "Downlading" typo in this user-facing message
            # (left as-is here; fixing it would change runtime output).
            print(
                f"-> DRY RUN: Downlading file from {args.path} to {revised_file_v1_path}",
                file=sys.stderr,
            )

    revised_file_versioned_path = os.path.join(
        output_dir, f"{args.anthology_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(input_file_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if not args.erratum:
        maybe_copy(input_file_path, canonical_path)

    # Clean up the temp file if we downloaded one.
    if args.path.startswith("http"):
        os.remove(input_file_path)
def add_attachment(anthology_id, path, attach_type, overwrite=False):
    """
    Adds a single attachment to the Anthology data files.

    Arguments:
    - The ACL ID of the paper (e.g., P17-1012)
    - The path to the attachment (can be a URL)
    - The attachment type (poster, presentation, note, software)
    - Whether to overwrite the downloaded file.

    Records the attachment (name, type, content hash) on the paper's node in
    data/xml/{collection_id}.xml and copies the file into place under
    ``args.attachment_root`` (module-level ``args`` — TODO confirm; this
    function reads it without taking it as a parameter).  Returns the
    destination path, or None if the file was already in place and
    ``overwrite`` is False.

    Fix: dropped the no-op ``except Exception as e: raise e`` handler, which
    re-raised the exception unchanged.
    """
    collection_id, volume_id, paper_id = deconstruct_anthology_id(anthology_id)

    if path.startswith("http"):
        # Remote attachment: download into a temp file first.
        _, input_file_path = tempfile.mkstemp()
        try:
            print(f"-> Downloading file from {path} to {input_file_path}",
                  file=sys.stderr)
            # Some hosts reject the default Python user agent.
            request = urllib.request.Request(
                path, headers={'User-Agent': 'Mozilla/5.0'})
            with urllib.request.urlopen(request) as url, open(
                    input_file_path, mode="wb") as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            raise Exception(f"Could not download {path}")
    else:
        input_file_path = path

    # Strip Dropbox-style "?dl=1" so it doesn't end up in the extension.
    file_extension = path.replace("?dl=1", "").split(".")[-1]
    # Many links from file sharing services are not informative and don't have
    # extensions, so we could try to guess.
    if file_extension not in ALLOWED_TYPES:
        detected = filetype.guess(input_file_path)
        if detected is not None:
            file_extension = detected.mime.split("/")[-1]
        if file_extension not in ALLOWED_TYPES:
            # Best-effort: warn but keep going with the unrecognized extension.
            print(
                f"Could not determine file extension for {anthology_id} at {path}",
                file=sys.stderr,
            )

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)

    attachment_file_name = f"{anthology_id}.{attach_type}.{file_extension}"

    paper = tree.getroot().find(
        f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        # Check if attachment already exists; only add the node when it doesn't
        # (for/else: the else runs when no existing entry matched).
        for attachment in paper.findall("attachment"):
            if attachment.text == attachment_file_name:
                print(
                    f"-> attachment {attachment_file_name} already exists in the XML",
                    file=sys.stderr,
                )
                break
        else:
            attachment = ET.Element("attachment")
            attachment.attrib["type"] = attach_type.lower()
            attachment.attrib["hash"] = checksum
            attachment.text = attachment_file_name

            paper.append(attachment)
            indent(tree.getroot())
            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f"-> added attachment {attachment_file_name} to the XML",
                  file=sys.stderr)

    else:
        # Not fatal: the file is still copied into place below.
        print(f"Paper {anthology_id} not found in the Anthology",
              file=sys.stderr)

    # Make sure directory exists
    output_dir = os.path.join(args.attachment_root, collection_id[0],
                              collection_id)
    if not os.path.exists(output_dir):
        # print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    # Copy file
    dest_path = os.path.join(output_dir, attachment_file_name)
    if os.path.exists(dest_path) and not overwrite:
        print(
            f"-> target file {dest_path} already in place, refusing to overwrite",
            file=sys.stderr,
        )
        return None

    shutil.copy(input_file_path, dest_path)
    os.chmod(dest_path, 0o644)
    print(f"-> copied {input_file_path} to {dest_path} and fixed perms",
          file=sys.stderr)

    # Clean up
    if path.startswith("http"):
        os.remove(input_file_path)

    return dest_path
def main(args):
    """Ingest one volume from a TSV file (lowercase-column variant) into the Anthology.

    Unlike the other ingest script in this file, this one takes year/venue/
    volume from command-line args, reuses an existing collection XML if
    present (replacing any same-id <volume>), and splits names with a
    NameSplitter instead of comma heuristics.
    """
    year = args.year
    venue = args.venue
    volume_id = args.volume
    collection_id = f"{year}.{venue}"

    splitter = NameSplitter()

    collection_file = os.path.join(args.anthology, "data", "xml",
                                   f"{collection_id}.xml")
    # Reuse the existing collection file if there is one, else start fresh.
    if os.path.exists(collection_file):
        tree = etree.parse(collection_file)
    else:
        tree = etree.ElementTree(
            make_simple_element("collection", attrib={"id": collection_id}))

    now = datetime.now()
    today = f"{now.year}-{now.month:02d}-{now.day:02d}"

    # Re-ingesting: drop any previous volume with this id before rebuilding.
    volume_node = tree.getroot().find(f"./volume[@id='{volume_id}']")
    if volume_node is not None:
        tree.getroot().remove(volume_node)

    volume = make_simple_element("volume", attrib={
        "id": volume_id,
        "ingest-date": today
    }, parent=tree.getroot())

    # Scratch directory named after the collection — TODO confirm what
    # populates it; nothing in this function writes there directly.
    if not os.path.exists(collection_id):
        print(f"Creating {collection_id}", file=sys.stderr)
        os.makedirs(collection_id)

    # Create entries for all the papers
    for paperid, row in enumerate(
            csv.DictReader(args.tsv_file, delimiter=args.delimiter)):
        pages = row.get("pages", None)

        # First row doubles as the volume metadata: its "author" column holds
        # the editors (and is blanked so they aren't re-added as authors).
        if paperid == 0:
            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle", row["booktitle"], parent=meta)
            make_simple_element("publisher", row["publisher"], parent=meta)
            make_simple_element("address", row["address"], parent=meta)
            make_simple_element("month", row["month"], parent=meta)
            make_simple_element("year", year, parent=meta)

            editors = row["author"].split(" and ")
            row["author"] = ""
            for editor_name in editors:
                editor = make_simple_element("editor", parent=meta)
                surname, givenname = splitter.best_split(editor_name)
                make_simple_element("first", givenname, parent=editor)
                make_simple_element("last", surname, parent=editor)

            # volume PDF: fetch, hash, and record on <meta>; proceedings_pdf
            # then points at the local copy for later page extraction.
            proceedings_pdf = args.proceedings_pdf
            if proceedings_pdf is not None:
                volume_anth_id = f"{collection_id}-{volume_id}"
                pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                              f"{volume_anth_id}.pdf")
                retrieve_url(proceedings_pdf, pdf_local_path)
                with open(pdf_local_path, "rb") as f:
                    checksum = compute_hash(f.read())
                make_simple_element("url", volume_anth_id,
                                    attrib={"hash": checksum},
                                    parent=meta)
                proceedings_pdf = pdf_local_path

        title_text = row["title"]
        # NOTE(review): nesting below reconstructed from a whitespace-mangled
        # original — if row 0 is NOT frontmatter, `paper` is never assigned
        # in this iteration and the author loop below would raise; presumably
        # these TSVs always start with a frontmatter row.  Verify.
        if paperid == 0:
            # The first row might be front matter (needs a special name)
            if title_text.lower() in ["frontmatter", "front matter"]:
                paper = make_simple_element("frontmatter", parent=volume)
        else:
            # NOTE(review): this inner check can never be true inside the
            # else branch (paperid != 0 here) — looks like refactoring residue.
            if paperid == 0:
                # Not frontmatter, so paper 1
                paperid += 1
            paper = make_simple_element("paper",
                                        attrib={"id": str(paperid)},
                                        parent=volume)
            # Only make the title for not-the-frontmatter
            make_simple_element("title", title_text, parent=paper)

        author_list = row["author"].split(" and ")
        for author_name in author_list:
            if author_name == "":
                continue
            author = make_simple_element("author", parent=paper)
            surname, givenname = splitter.best_split(author_name)
            make_simple_element("first", givenname, parent=author)
            make_simple_element("last", surname, parent=author)

        if pages is not None and pages != "":
            make_simple_element("pages", pages, parent=paper)

        # Find the PDF, either listed directly, or extracted from the proceedings PDF
        anth_id = f"{collection_id}-{volume_id}.{paperid}"
        pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                      f"{anth_id}.pdf")
        url = None
        if "pdf" in row and row["pdf"] != "":
            if retrieve_url(row["pdf"], pdf_local_path):
                url = anth_id
        elif "pages in pdf" in row:
            # NOTE(review): reads row["pages"], not row["pages in pdf"],
            # despite keying the branch on "pages in pdf" — confirm intended.
            pdf_pages = row["pages"]
            extract_pages(proceedings_pdf, pdf_pages, pdf_local_path)
            url = anth_id

        if url is not None:
            with open(pdf_local_path, "rb") as f:
                checksum = compute_hash(f.read())
            make_simple_element("url", url,
                                attrib={"hash": checksum},
                                parent=paper)

        if "abstract" in row:
            make_simple_element("abstract", row["abstract"], parent=paper)

        # Optional presentation attachment under ../attachments/{venue}/.
        if "presentation" in row:
            url = row["presentation"]
            if url is not None and url != "" and url != "None":
                extension = row["presentation"].split(".")[-1]
                name = f"{anth_id}.Presentation.{extension}"
                local_path = os.path.join(
                    args.anthology_files_path,
                    "..",
                    "attachments",
                    venue,
                    name,
                )
                if retrieve_url(row["presentation"], local_path):
                    make_simple_element("attachment", name,
                                        attrib={"type": "presentation"},
                                        parent=paper)

        # Normalize child nodes (LaTeX markup -> Anthology conventions).
        for node in paper:
            normalize(node, informat="latex")

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    tree.write(collection_file,
               encoding="UTF-8",
               xml_declaration=True,
               with_tail=True)
def main(args):
    """Register an erratum or revision PDF for an Anthology paper (newer variant).

    Compared to the other add-revision main() in this file, this one uses the
    download_file/validate_file_type helpers, supports new-style IDs and
    frontmatter (paper_id "0"), updates the <url> hash in place, and hashes
    the downloaded ORIGINAL file for the implicit version-1 node.
    """

    def maybe_copy(file_from, file_to):
        # Copy + chmod 644 unless --dry-run, in which case only announce.
        if not args.dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    change_type = "erratum" if args.erratum else "revision"
    change_letter = "e" if args.erratum else "v"

    print(f"Processing {change_type} to {args.anthology_id}...")

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        download_file(args.path, input_file_path)
    else:
        input_file_path = args.path

    validate_file_type(input_file_path)

    collection_id, volume_id, paper_id = deconstruct_anthology_id(
        args.anthology_id)
    # NOTE(review): paper_extension is computed but never used below.
    paper_extension = args.path.split(".")[-1]

    # The new version
    revno = None

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Files for old-style IDs are stored under anthology-files/pdf/P/P19/*
    # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/*
    if is_newstyle_id(args.anthology_id):
        venue_name = collection_id.split(".")[1]
        output_dir = os.path.join(args.anthology_dir, "pdf", venue_name)
    else:
        output_dir = os.path.join(args.anthology_dir, "pdf", collection_id[0],
                                  collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{args.anthology_id}.pdf")

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)
    # paper_id "0" denotes the volume's frontmatter node.
    if paper_id == "0":
        paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
    else:
        paper = tree.getroot().find(
            f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")

    if paper is not None:
        # Next id = 1 + highest existing id; errata start at 1, revisions at 2
        # (id 1 is reserved for the original version).
        revisions = paper.findall(change_type)
        revno = 1 if args.erratum else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not args.dry_run:
            # Update the URL hash on the <url> tag
            url = paper.find("./url")
            if url is not None:
                url.attrib["hash"] = checksum

            if not args.erratum and revno == 2:
                # NOTE(review): if the paper has no <url> child,
                # current_version_url is never assigned but is still used by
                # download_file below (NameError).  Confirm whether papers
                # without <url> can reach this path.
                if paper.find("./url") is not None:
                    current_version_url = infer_url(
                        paper.find("./url").text) + ".pdf"

                # Download original file
                # There are no versioned files the first time around, so create the first one
                # (essentially backing up the original version)
                revised_file_v1_path = os.path.join(
                    output_dir, f"{args.anthology_id}{change_letter}1.pdf")
                download_file(current_version_url, revised_file_v1_path)
                validate_file_type(revised_file_v1_path)

                # Hash the ORIGINAL file for the version-1 node (unlike the
                # older script variant, which reused the new file's hash).
                with open(revised_file_v1_path, "rb") as f:
                    old_checksum = compute_hash(f.read())

                # First revision requires making the original version explicit
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{args.anthology_id}{change_letter}1",
                        "hash": old_checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                args.explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{args.anthology_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": args.date,
                },
                parent=paper,
            )
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML',
                  file=sys.stderr)

    else:
        print(
            f"-> FATAL: paper ID {args.anthology_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    revised_file_versioned_path = os.path.join(
        output_dir, f"{args.anthology_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(input_file_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if not args.erratum:
        maybe_copy(input_file_path, canonical_path)

    # Clean up the temp file if we downloaded one.
    if args.path.startswith("http"):
        os.remove(input_file_path)