Example #1
def main(datadir, filelist):
    to_check = defaultdict(list)

    for filename in filelist:
        basename = os.path.basename(filename)
        collection_id = basename.split("-")[0]
        with open(filename, "rb") as f:
            checksum = compute_hash(f.read())

        to_check[collection_id].append((basename, checksum))

    for collection_id, checklist in to_check.items():
        xml_file = f"{datadir}/xml/{collection_id}.xml"
        tree = etree.parse(xml_file)
        root = tree.getroot()

        for filename, checksum in checklist:
            if filename.endswith(".pdf"):
                xpath = (f'//attachment[text()="{filename}"] | '
                         f'//url[text()="{filename[:-4]}"] | '
                         f'//erratum[text()="{filename[:-4]}"] | '
                         f'//revision[@href="{filename[:-4]}"]')
            else:
                xpath = f'//attachment[text()="{filename}"]'

            find = etree.XPath(xpath)(root)
            if not find:
                log.error(
                    f"{filename}: couldn't find file in {collection_id}.xml")
                continue
            elif len(find) > 1:
                # this should never happen
                log.warning(
                    f"{filename}: multiple entries with that name in {collection_id}.xml"
                )

            expected = find[0].get("hash")
            if expected != checksum:
                log.error(
                    f"{filename}: CRC32 mismatch -- {checksum} != {expected}")
            else:
                log.debug(f"{filename}: checksum verified")
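
These scripts share a compute_hash helper. A minimal sketch, assuming a plain
CRC32 checksum as suggested by the "CRC32 mismatch" log message above (the
real helper may differ):

import zlib

def compute_hash(data: bytes) -> str:
    # Hypothetical stand-in: CRC32 is inferred from the log message above,
    # not confirmed from the source. Mask to 32 bits, render as 8-digit hex.
    return f"{zlib.crc32(data) & 0xFFFFFFFF:08x}"
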
Example #2
def main(args):
    year, venue, _ = os.path.basename(args.tsv_file.name).split(".")

    # Set the volume name from the collection file, or default to 1
    # The file name is either "2012.eamt.tsv" or "2012.eamt-main.tsv".
    # The default volume name is "1".
    if "-" in venue:
        venue, volume_id = venue.split("-")
    else:
        volume_id = "1"

    collection_id = f"{year}.{venue}"

    tree = etree.ElementTree(
        make_simple_element("collection", attrib={"id": collection_id}))

    now = datetime.now()
    today = f"{now.year}-{now.month:02d}-{now.day:02d}"

    volume = make_simple_element("volume",
                                 attrib={
                                     "id": volume_id,
                                     "ingest-date": today
                                 })
    tree.getroot().insert(0, volume)

    # Location of entire-proceedings PDF
    proceedings_pdf = args.proceedings

    # Create the metadata for the paper
    meta = None
    for row in csv.DictReader(args.meta_file, delimiter="\t"):
        current_collection_id = f"{row['Year']}.{row['Conference code']}"
        if current_collection_id == collection_id:
            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle",
                                row["Conference title"],
                                parent=meta)
            make_simple_element("publisher", row["Publisher"], parent=meta)
            make_simple_element("address", row["Location"], parent=meta)
            make_simple_element("month", row["Dates held"], parent=meta)
            make_simple_element("year", row["Year"], parent=meta)

            url = row["URL"]

            if url.endswith(".pdf"):
                if proceedings_pdf:
                    print(
                        "Overriding --proceedings with proceedings PDF found in conference list",
                        file=sys.stderr,
                    )
                proceedings_pdf = url

            elif "Complete PDF" in row and row["Complete PDF"] != "":
                proceedings_pdf = row["Complete PDF"]

            # volume PDF
            if proceedings_pdf is not None:
                volume_anth_id = f"{collection_id}-{volume_id}"
                pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                              f"{volume_anth_id}.pdf")
                download(proceedings_pdf, pdf_local_path)
                with open(pdf_local_path, "rb") as f:
                    checksum = compute_hash(f.read())
                make_simple_element("url",
                                    volume_anth_id,
                                    attrib={"hash": checksum},
                                    parent=meta)
                proceedings_pdf = pdf_local_path

            if row["Editors"] != "" and "?" not in row["Editors"]:
                editors = row["Editors"].split(" and ")
                for editor_name in editors:
                    editor = make_simple_element("editor", parent=meta)
                    if ", " in editor_name:
                        last, first = editor_name.split(", ")
                    else:
                        first, last = (
                            ' '.join(editor_name.split()[:-1]),
                            editor_name.split()[-1],
                        )
                    make_simple_element("first", first, parent=editor)
                    make_simple_element("last", last, parent=editor)
            break
    else:
        print(
            f"Couldn't find conference code {collection_id} in 'Conference code' field of metadata file {args.meta_file.name}",
            file=sys.stderr,
        )
        sys.exit(1)

    if not os.path.exists(collection_id):
        print(f"Creating {collection_id}", file=sys.stderr)
        os.makedirs(collection_id)

    paperid = 0
    # Create entries for all the papers
    for row in csv.DictReader(args.tsv_file, delimiter='\t'):
        pages = row.get("Pagenumbers", None)

        title_text = row["Title"]

        # The first row might be front matter (needs a special name)
        if title_text == "Frontmatter" and paperid == 0:
            paper = make_simple_element("frontmatter", parent=volume)

        else:
            paperid += 1
            paper = make_simple_element("paper",
                                        attrib={"id": str(paperid)},
                                        parent=volume)
            # Only make the title for not-the-frontmatter
            make_simple_element("title", title_text, parent=paper)

        author_list = row["Authors"].split(" and ")

        for author_name in author_list:
            if author_name == "":
                continue
            author = make_simple_element("author", parent=paper)
            if ", " in author_name:
                last, first = author_name.split(", ")
            else:
                first, last = ' '.join(
                    author_name.split()[:-1]), author_name.split()[-1]
            make_simple_element("first", first, parent=author)
            make_simple_element("last", last, parent=author)

        if pages is not None:
            make_simple_element("pages", pages, parent=paper)

        # Find the PDF, either listed directly, or extracted from the proceedings PDF
        anth_id = f"{collection_id}-{volume_id}.{paperid}"
        pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                      f"{anth_id}.pdf")
        url = None
        if "Pdf" in row and row["Pdf"] != "":
            if download(row["Pdf"], pdf_local_path):
                url = anth_id

        elif "pages in pdf" in row:
            pdf_pages = row["pages in pdf"]
            extract_pages(proceedings_pdf, pdf_pages, pdf_local_path)
            url = anth_id

        if url is not None:
            with open(pdf_local_path, "rb") as f:
                checksum = compute_hash(f.read())

            make_simple_element("url",
                                url,
                                attrib={"hash": checksum},
                                parent=paper)

        if "Abstract" in row:
            make_simple_element("abstract", row["Abstract"], parent=paper)

        if "Presentation" in row:
            url = row["Presentation"]
            if url is not None and url != "" and url != "None":
                extension = row["Presentation"].split(".")[-1]
                name = f"{anth_id}.Presentation.{extension}"
                local_path = os.path.join(
                    args.anthology_files_path,
                    "..",
                    "attachments",
                    venue,
                    name,
                )
                if download(row["Presentation"], local_path):
                    make_simple_element("attachment",
                                        name,
                                        attrib={"type": "presentation"},
                                        parent=paper)

        # Normalize
        for node in paper:
            normalize(node, informat="latex")

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    collection_file = os.path.join(args.anthology, "data", "xml",
                                   f"{collection_id}.xml")
    tree.write(collection_file,
               encoding="UTF-8",
               xml_declaration=True,
               with_tail=True)
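
The make_simple_element helper used throughout these examples builds an XML
element, optionally with text, attributes, and a parent. A minimal sketch
assuming lxml (the real helper in the Anthology codebase has more options):

from lxml import etree

def make_simple_element(tag, text=None, attrib=None, parent=None):
    # Attach to `parent` when given, else create a free-standing element
    el = etree.Element(tag) if parent is None else etree.SubElement(parent, tag)
    for key, value in (attrib or {}).items():
        el.attrib[key] = value
    if text is not None:
        el.text = text
    return el
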
Example #3
            volume.attrib["id"] = str(issue_count)
            collection.append(volume)  # xml collection = journal volume
            volume.append(
                issue_info_to_node(issue_info, year, volume_id, issue_count, is_tacl)
            )
        papernode.attrib["id"] = f"{i}"
        paper_id = get_paperid(xml, i, issue_count)

        pdf = xml.with_suffix(".pdf")
        if not pdf.is_file():
            logging.error("Missing pdf for " + xml.name)
        elif args.pdf_save_destination:
            destination = write_to_here / "{}-{}.pdf".format(volume_id, paper_id)
            shutil.copyfile(pdf, destination)
        with open(pdf, "rb") as f:
            checksum = compute_hash(f.read())

        url_text = STANDARD_URL.format(volume=volume_id, paper=paper_id)
        url = etree.Element("url")
        url.attrib["hash"] = checksum
        url.text = url_text
        papernode.append(url)

        if args.old_version:
            old_paper = old_root.find(f"*[@id='{paper_id}']")
            if old_paper is None:
                logging.error(
                    f"No old version for {paper_id} with title {papernode.find('title').text}"
                )
            else:
                old_video = old_paper.find("video")
Example #4
def main(args):
    def maybe_copy(file_from, file_to):
        if not args.dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    change_type = "erratum" if args.erratum else "revision"
    change_letter = "e" if args.erratum else "v"

    print(f"Processing {change_type} to {args.anthology_id}...")

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        try:
            print(f"-> Downloading file from {args.path}", file=sys.stderr)
            with urllib.request.urlopen(args.path) as url, open(
                    input_file_path, mode="wb") as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            print("An SSL error was encountered in downloading the files.",
                  file=sys.stderr)
            sys.exit(1)
    else:
        input_file_path = args.path

    detected = filetype.guess(input_file_path)
    if detected is None or not detected.mime.endswith(detected.extension):
        mime_type = 'UNKNOWN' if detected is None else detected.mime
        print(
            f"FATAL: {args.anthology_id} file {args.path} has MIME type {mime_type}",
            file=sys.stderr,
        )
        sys.exit(1)

    collection_id, volume_id, paper_id = deconstruct_anthology_id(
        args.anthology_id)
    paper_extension = args.path.split(".")[-1]

    # The new version
    revno = None

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(
        f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if args.erratum else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not args.dry_run:
            if not args.erratum and revno == 2:
                # First revision requires making the original version explicit
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{args.anthology_id}{change_letter}1",
                        "hash": checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                args.explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{args.anthology_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": args.date,
                },
                parent=paper,
            )
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML',
                  file=sys.stderr)

    else:
        print(
            f"-> FATAL: paper ID {args.anthology_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    output_dir = os.path.join(args.anthology_dir, "pdf", collection_id[0],
                              collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{args.anthology_id}.pdf")

    if not args.erratum and revno == 2:
        # There are no versioned files the first time around, so create the first one
        # (essentially backing up the original version)
        revised_file_v1_path = os.path.join(
            output_dir, f"{args.anthology_id}{change_letter}1.pdf")

        current_version = ANTHOLOGY_PDF.format(args.anthology_id)
        if not args.dry_run:
            try:
                print(
                    f"-> Downloading file from {args.path} to {revised_file_v1_path}",
                    file=sys.stderr,
                )
                with urllib.request.urlopen(current_version) as url, open(
                        revised_file_v1_path, mode="wb") as fh:
                    fh.write(url.read())
            except ssl.SSLError:
                print(
                    f"-> FATAL: An SSL error was encountered in downloading {args.path}.",
                    file=sys.stderr,
                )
                sys.exit(1)
        else:
            print(
                f"-> DRY RUN: Downlading file from {args.path} to {revised_file_v1_path}",
                file=sys.stderr,
            )

    revised_file_versioned_path = os.path.join(
        output_dir, f"{args.anthology_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(input_file_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if not args.erratum:
        maybe_copy(input_file_path, canonical_path)

    if args.path.startswith("http"):
        os.remove(input_file_path)
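
Examples #4, #5, and #7 all split an Anthology ID with
deconstruct_anthology_id. A rough sketch for new-style IDs only, e.g.
"2020.acl-main.1" -> ("2020.acl", "main", "1"); old-style IDs such as
P17-1012 pack volume and paper into the digits and need separate handling:

def deconstruct_anthology_id(anthology_id: str):
    # Sketch: new-style IDs only, e.g. "2020.acl-main.1"
    collection_id, rest = anthology_id.rsplit("-", 1)
    volume_id, _, paper_id = rest.partition(".")
    return collection_id, volume_id, paper_id
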
Example #5
def add_attachment(anthology_id, path, attach_type, overwrite=False):
    """
    Adds a single attachment to the Anthology data files.

    Arguments:
    - The ACL ID of the paper (e.g., P17-1012)
    - The path to the attachment (can be a URL)
    - The attachment type (poster, presentation, note, software)
    - Whether to overwrite the downloaded file.
    """

    collection_id, volume_id, paper_id = deconstruct_anthology_id(anthology_id)

    if path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        try:
            print(f"-> Downloading file from {path} to {input_file_path}",
                  file=sys.stderr)
            request = urllib.request.Request(
                path, headers={'User-Agent': 'Mozilla/5.0'})
            with urllib.request.urlopen(request) as url, open(
                    input_file_path, mode="wb") as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            raise Exception(f"Could not download {path}")
    else:
        input_file_path = path

    file_extension = path.replace("?dl=1", "").split(".")[-1]
    # Many links from file-sharing services are uninformative and lack
    # extensions, so try to guess the type from the file contents.
    if file_extension not in ALLOWED_TYPES:
        detected = filetype.guess(input_file_path)
        if detected is not None:
            file_extension = detected.mime.split("/")[-1]
            if file_extension not in ALLOWED_TYPES:
                print(
                    f"Could not determine file extension for {anthology_id} at {path}",
                    file=sys.stderr,
                )

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)

    attachment_file_name = f"{anthology_id}.{attach_type}.{file_extension}"

    paper = tree.getroot().find(
        f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        # Check if attachment already exists
        for attachment in paper.findall("attachment"):
            if attachment.text == attachment_file_name:
                print(
                    f"-> attachment {attachment_file_name} already exists in the XML",
                    file=sys.stderr,
                )
                break
        else:
            attachment = ET.Element("attachment")
            attachment.attrib["type"] = attach_type.lower()
            attachment.attrib["hash"] = checksum
            attachment.text = attachment_file_name

            paper.append(attachment)
            indent(tree.getroot())
            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f"-> added attachment {attachment_file_name} to the XML",
                  file=sys.stderr)

    else:
        print(f"Paper {anthology_id} not found in the Anthology",
              file=sys.stderr)

    # Make sure directory exists
    # NOTE: `args` here is the script-level argparse namespace, not a parameter
    output_dir = os.path.join(args.attachment_root, collection_id[0],
                              collection_id)
    if not os.path.exists(output_dir):
        # print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    # Copy file
    dest_path = os.path.join(output_dir, attachment_file_name)
    if os.path.exists(dest_path) and not overwrite:
        print(
            f"-> target file {dest_path} already in place, refusing to overwrite",
            file=sys.stderr,
        )
        return None

    shutil.copy(input_file_path, dest_path)
    os.chmod(dest_path, 0o644)
    print(f"-> copied {input_file_path} to {dest_path} and fixed perms",
          file=sys.stderr)

    # Clean up
    if path.startswith("http"):
        os.remove(input_file_path)

    return dest_path
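
A usage sketch (the URL is made up for illustration; P17-1012 is the sample
ID from the docstring):

dest = add_attachment(
    "P17-1012",
    "https://example.org/slides.pdf",  # hypothetical attachment URL
    "presentation",
    overwrite=False,
)
print(dest)  # destination path, or None if the file was already in place
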
Example #6
def main(args):
    year = args.year
    venue = args.venue
    volume_id = args.volume
    collection_id = f"{year}.{venue}"

    splitter = NameSplitter()

    collection_file = os.path.join(args.anthology, "data", "xml",
                                   f"{collection_id}.xml")
    if os.path.exists(collection_file):
        tree = etree.parse(collection_file)
    else:
        tree = etree.ElementTree(
            make_simple_element("collection", attrib={"id": collection_id}))

    now = datetime.now()
    today = f"{now.year}-{now.month:02d}-{now.day:02d}"

    volume_node = tree.getroot().find(f"./volume[@id='{volume_id}']")
    if volume_node is not None:
        tree.getroot().remove(volume_node)

    volume = make_simple_element("volume",
                                 attrib={
                                     "id": volume_id,
                                     "ingest-date": today
                                 },
                                 parent=tree.getroot())

    if not os.path.exists(collection_id):
        print(f"Creating {collection_id}", file=sys.stderr)
        os.makedirs(collection_id)

    # Create entries for all the papers
    for paperid, row in enumerate(
            csv.DictReader(args.tsv_file, delimiter=args.delimiter)):
        pages = row.get("pages", None)

        if paperid == 0:
            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle", row["booktitle"], parent=meta)
            make_simple_element("publisher", row["publisher"], parent=meta)
            make_simple_element("address", row["address"], parent=meta)
            make_simple_element("month", row["month"], parent=meta)
            make_simple_element("year", year, parent=meta)

            editors = row["author"].split(" and ")
            row["author"] = ""
            for editor_name in editors:
                editor = make_simple_element("editor", parent=meta)
                surname, givenname = splitter.best_split(editor_name)
                make_simple_element("first", givenname, parent=editor)
                make_simple_element("last", surname, parent=editor)

            # volume PDF
            proceedings_pdf = args.proceedings_pdf
            if proceedings_pdf is not None:
                volume_anth_id = f"{collection_id}-{volume_id}"
                pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                              f"{volume_anth_id}.pdf")
                retrieve_url(proceedings_pdf, pdf_local_path)
                with open(pdf_local_path, "rb") as f:
                    checksum = compute_hash(f.read())
                make_simple_element("url",
                                    volume_anth_id,
                                    attrib={"hash": checksum},
                                    parent=meta)
                proceedings_pdf = pdf_local_path

        title_text = row["title"]

        # The first row might be front matter (needs a special name)
        if paperid == 0 and title_text.lower() in ("frontmatter", "front matter"):
            paper = make_simple_element("frontmatter", parent=volume)
        else:
            if paperid == 0:
                # First row is not front matter, so number it as paper 1
                paperid += 1

            paper = make_simple_element("paper",
                                        attrib={"id": str(paperid)},
                                        parent=volume)
            # Only make the title for not-the-frontmatter
            make_simple_element("title", title_text, parent=paper)

        author_list = row["author"].split(" and ")

        for author_name in author_list:
            if author_name == "":
                continue
            author = make_simple_element("author", parent=paper)
            surname, givenname = splitter.best_split(author_name)
            make_simple_element("first", givenname, parent=author)
            make_simple_element("last", surname, parent=author)

        if pages is not None and pages != "":
            make_simple_element("pages", pages, parent=paper)

        # Find the PDF, either listed directly, or extracted from the proceedings PDF
        anth_id = f"{collection_id}-{volume_id}.{paperid}"
        pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                      f"{anth_id}.pdf")
        url = None
        if "pdf" in row and row["pdf"] != "":
            if retrieve_url(row["pdf"], pdf_local_path):
                url = anth_id

        elif "pages in pdf" in row:
            pdf_pages = row["pages"]
            extract_pages(proceedings_pdf, pdf_pages, pdf_local_path)
            url = anth_id

        if url is not None:
            with open(pdf_local_path, "rb") as f:
                checksum = compute_hash(f.read())

            make_simple_element("url",
                                url,
                                attrib={"hash": checksum},
                                parent=paper)

        if "abstract" in row:
            make_simple_element("abstract", row["abstract"], parent=paper)

        if "presentation" in row:
            url = row["presentation"]
            if url is not None and url != "" and url != "None":
                extension = row["presentation"].split(".")[-1]
                name = f"{anth_id}.Presentation.{extension}"
                local_path = os.path.join(
                    args.anthology_files_path,
                    "..",
                    "attachments",
                    venue,
                    name,
                )
                if retrieve_url(row["presentation"], local_path):
                    make_simple_element("attachment",
                                        name,
                                        attrib={"type": "presentation"},
                                        parent=paper)

        # Normalize
        for node in paper:
            normalize(node, informat="latex")

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    tree.write(collection_file,
               encoding="UTF-8",
               xml_declaration=True,
               with_tail=True)
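
Example #6 delegates name splitting to a NameSplitter. A naive stand-in,
assuming the same "Last, First" / last-token heuristics Example #2 applies
by hand (the real splitter is smarter about multi-word surnames):

class NameSplitter:
    def best_split(self, name: str):
        # Returns (surname, given name), matching the call sites above
        if ", " in name:
            last, first = name.split(", ", 1)
        else:
            parts = name.split()
            first, last = " ".join(parts[:-1]), parts[-1]
        return last, first
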
Example #7
def main(args):
    def maybe_copy(file_from, file_to):
        if not args.dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    change_type = "erratum" if args.erratum else "revision"
    change_letter = "e" if args.erratum else "v"

    print(f"Processing {change_type} to {args.anthology_id}...")

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        download_file(args.path, input_file_path)
    else:
        input_file_path = args.path

    validate_file_type(input_file_path)

    collection_id, volume_id, paper_id = deconstruct_anthology_id(
        args.anthology_id)
    paper_extension = args.path.split(".")[-1]

    # The new version
    revno = None

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Files for old-style IDs are stored under anthology-files/pdf/P/P19/*
    # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/*
    if is_newstyle_id(args.anthology_id):
        venue_name = collection_id.split(".")[1]
        output_dir = os.path.join(args.anthology_dir, "pdf", venue_name)
    else:
        output_dir = os.path.join(args.anthology_dir, "pdf", collection_id[0],
                                  collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{args.anthology_id}.pdf")

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)
    if paper_id == "0":
        paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
    else:
        paper = tree.getroot().find(
            f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if args.erratum else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not args.dry_run:
            # Update the URL hash on the <url> tag
            url = paper.find("./url")
            if url is not None:
                url.attrib["hash"] = checksum

            if not args.erratum and revno == 2:
                if url is not None:
                    current_version_url = infer_url(url.text) + ".pdf"

                # Download original file
                # There are no versioned files the first time around, so create the first one
                # (essentially backing up the original version)
                revised_file_v1_path = os.path.join(
                    output_dir, f"{args.anthology_id}{change_letter}1.pdf")

                download_file(current_version_url, revised_file_v1_path)
                validate_file_type(revised_file_v1_path)

                with open(revised_file_v1_path, "rb") as f:
                    old_checksum = compute_hash(f.read())

                # First revision requires making the original version explicit
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{args.anthology_id}{change_letter}1",
                        "hash": old_checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                args.explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{args.anthology_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": args.date,
                },
                parent=paper,
            )
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML',
                  file=sys.stderr)

    else:
        print(
            f"-> FATAL: paper ID {args.anthology_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    revised_file_versioned_path = os.path.join(
        output_dir, f"{args.anthology_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(input_file_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if not args.erratum:
        maybe_copy(input_file_path, canonical_path)

    if args.path.startswith("http"):
        os.remove(input_file_path)
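
Example #7 factors the inline download and MIME-type checks of Example #4
into download_file and validate_file_type. A sketch consistent with that
inline logic (the exact signatures are assumptions):

import sys
import urllib.request

import filetype  # third-party package, also used in Examples #4 and #5

def download_file(url, dest_path):
    # Same fetch-and-write pattern as the inline code in Example #4
    with urllib.request.urlopen(url) as response, open(dest_path, "wb") as fh:
        fh.write(response.read())

def validate_file_type(path):
    # Reject files whose detected MIME type doesn't match their extension
    detected = filetype.guess(path)
    if detected is None or not detected.mime.endswith(detected.extension):
        mime_type = "UNKNOWN" if detected is None else detected.mime
        print(f"FATAL: {path} has MIME type {mime_type}", file=sys.stderr)
        sys.exit(1)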