Example #1
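# NOTE: this appears to be a snippet from an ACL Anthology maintenance script.
# os, sys, ET (an ElementTree-compatible parser) and sleep are assumed to be
# imported at the top of the file; deconstruct_anthology_id, is_newstyle_id,
# infer_url, test_url_code and retrieve_url are assumed to come from the
# Anthology's own utility modules, and `args` holds the script's parsed
# command-line arguments.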
def process_volume(anthology_volume):

    collection_id, volume_id, _ = deconstruct_anthology_id(anthology_volume)
    if is_newstyle_id(anthology_volume):
        venue_path = collection_id.split(".")[1]
    else:
        venue_path = os.path.join(collection_id[0], collection_id)

    print(f"Downloading PDFs for {anthology_volume}", file=sys.stderr)

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)

    for paper in tree.getroot().findall(".//paper"):
        anthid = paper.find("./url").text

        # Try to get the URL from the Anthology
        if not test_url_code(infer_url(anthid)):
            doi = paper.find("./doi").text
            doi_pdf = f"https://www.mitpressjournals.org/doi/pdf/{doi}"
            local_path = os.path.join(args.anthology_files_dir, venue_path,
                                      f"{anthid}.pdf")
            if not os.path.exists(os.path.dirname(local_path)):
                os.makedirs(os.path.dirname(local_path))

            retrieve_url(doi_pdf, local_path)
            print(f"Saved {doi_pdf} to {local_path}")
            sleep(1)
Example #2
        # Presumably the constructor of a venue/volume helper class in the same script
        def __init__(self, acronym, letter, anth_id):
            self.parent_venue = acronym.lower()
            self.anth_id = anth_id

            collection_id, self.volume_id, _ = deconstruct_anthology_id(anth_id)
            if is_newstyle_id(collection_id):
                self.venue = collection_id.split(".")[1]
                self.is_parent_venue = self.venue == self.parent_venue
            else:
                self.venue = collection_id[0]
                self.is_parent_venue = self.venue == letter
Example #3
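# NOTE: this appears to be a snippet from a script that registers a revision or
# erratum for an existing paper. os, sys, shutil, tempfile and an
# ElementTree-compatible ET are assumed to be imported elsewhere;
# deconstruct_anthology_id, is_newstyle_id, infer_url, download_file,
# validate_file_type, compute_hash, make_simple_element and indent are assumed
# to come from the Anthology's own utility modules.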
def main(args):
    def maybe_copy(file_from, file_to):
        if not args.dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    change_type = "erratum" if args.erratum else "revision"
    change_letter = "e" if args.erratum else "v"

    print(f"Processing {change_type} to {args.anthology_id}...")

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        download_file(args.path, input_file_path)
    else:
        input_file_path = args.path

    validate_file_type(input_file_path)

    collection_id, volume_id, paper_id = deconstruct_anthology_id(
        args.anthology_id)
    paper_extension = args.path.split(".")[-1]

    # The new version
    revno = None

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Files for old-style IDs are stored under anthology-files/pdf/P/P19/*
    # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/*
    if is_newstyle_id(args.anthology_id):
        venue_name = collection_id.split(".")[1]
        output_dir = os.path.join(args.anthology_dir, "pdf", venue_name)
    else:
        output_dir = os.path.join(args.anthology_dir, "pdf", collection_id[0],
                                  collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{args.anthology_id}.pdf")

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)
    if paper_id == "0":
        paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
    else:
        paper = tree.getroot().find(
            f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        revisions = paper.findall(change_type)
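        # An erratum starts at e1; a revision starts at v2, since v1 is reserved
        # for the backed-up original version. If earlier revisions exist,
        # continue numbering from the highest recorded id.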
        revno = 1 if args.erratum else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not args.dry_run:
            # Update the URL hash on the <url> tag
            url = paper.find("./url")
            if url is not None:
                url.attrib["hash"] = checksum

            if not args.erratum and revno == 2:
                # Assumes the paper already has a <url> entry to back up as v1
                if paper.find("./url") is not None:
                    current_version_url = infer_url(
                        paper.find("./url").text) + ".pdf"

                # Download original file
                # There are no versioned files the first time around, so create the first one
                # (essentially backing up the original version)
                revised_file_v1_path = os.path.join(
                    output_dir, f"{args.anthology_id}{change_letter}1.pdf")

                download_file(current_version_url, revised_file_v1_path)
                validate_file_type(revised_file_v1_path)

                with open(revised_file_v1_path, "rb") as f:
                    old_checksum = compute_hash(f.read())

                # First revision requires making the original version explicit
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{args.anthology_id}{change_letter}1",
                        "hash": old_checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                args.explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{args.anthology_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": args.date,
                },
                parent=paper,
            )
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML',
                  file=sys.stderr)

    else:
        print(
            f"-> FATAL: paper ID {args.anthology_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    revised_file_versioned_path = os.path.join(
        output_dir, f"{args.anthology_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(input_file_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if not args.erratum:
        maybe_copy(input_file_path, canonical_path)

    if args.path.startswith("http"):
        os.remove(input_file_path)
Example #4
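# NOTE: this appears to be a snippet that assembles a Crossref DOI deposit
# batch. os, sys, re, time and lxml's etree are assumed to be imported
# elsewhere; make_simple_element, deconstruct_anthology_id, is_newstyle_id and
# MarkupFormatter, along with constants such as DEPOSITOR_NAME, EMAIL_ADDRESS,
# REGISTRANT, MONTH_HASH, DOI_PREFIX, PUBLISHER_PLACE and ANTHOLOGY_URL, are
# assumed to be defined or imported at module level.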
def main(volumes):

    formatter = MarkupFormatter()

    ## Assemble container
    doi_batch = make_simple_element(
        "doi_batch",
        attrib={
            "xmlns": "http://www.crossref.org/schema/4.4.1",
            "{http://www.w3.org/2001/XMLSchema-instance}schemaLocation":
            "http://www.crossref.org/schema/4.4.1 http://www.crossref.org/schema/deposit/crossref4.4.1.xsd",
            "version": "4.4.1",
        },
        namespaces={"xsi": "http://www.w3.org/2001/XMLSchema-instance"},
    )
    new_volume = etree.ElementTree(doi_batch)

    ## Assemble head
    head = make_simple_element("head", parent=new_volume.getroot())
    dbi = make_simple_element("doi_batch_id",
                              text=str(int(time.time())),
                              parent=head)

    timestamp = make_simple_element("timestamp",
                                    text=str(int(time.time())),
                                    parent=head)

    depositor = make_simple_element("depositor", parent=head)
    depositor_name = make_simple_element("depositor_name",
                                         text=DEPOSITOR_NAME,
                                         parent=depositor)
    email_address = make_simple_element("email_address",
                                        text=EMAIL_ADDRESS,
                                        parent=depositor)

    registrant = make_simple_element("registrant",
                                     text=REGISTRANT,
                                     parent=head)

    ## Assemble body
    body = make_simple_element("body", parent=new_volume.getroot())

    year = ""
    start_month = ""
    end_month = ""

    for full_volume_id in sorted(volumes):
        collection_id, volume_id, _ = deconstruct_anthology_id(full_volume_id)

        collection_file = os.path.join(os.path.dirname(sys.argv[0]), "..",
                                       "data", "xml", f"{collection_id}.xml")
        tree = etree.parse(collection_file)

        v = tree.getroot().find(f"./volume[@id='{volume_id}']")
        if v is None:
            print(f"* Can't find volume {full_volume_id}", file=sys.stderr)
            continue

        ## Assemble frontmatter
        c = make_simple_element("conference", parent=body)
        contribs = make_simple_element("contributors", parent=c)
        editor_index = 0

        meta = v.find("./meta")
        for tag in meta:
            if tag.tag == "year":
                year = tag.text
            elif tag.tag == "month":
                month = tag.text
                try:
                    start_month = MONTH_HASH[re.split("[-–]", month)[0]]
                    end_month = MONTH_HASH[re.split("[-–]", month)[1]]
                except IndexError as e:  # only one month
                    start_month = MONTH_HASH[month]
                    end_month = MONTH_HASH[month]
                except Exception as e:
                    print(
                        f"FATAL: can't parse month {month} in {full_volume_id}",
                        file=sys.stderr,
                    )
                    sys.exit(1)
            elif tag.tag == "url":
                url = tag.text
            elif tag.tag == "booktitle":
                booktitle = formatter.as_text(tag)
            elif tag.tag == "address":
                address = tag.text
            elif tag.tag == "publisher":
                publisher = tag.text
            elif tag.tag == "editor":
                pn = make_simple_element(
                    "person_name",
                    parent=contribs,
                    attrib={
                        "contributor_role": "chair",
                        "sequence":
                        "first" if editor_index == 0 else "additional",
                    },
                )
                editor_index += 1

                for name_part in tag:
                    # Skip empty first names (e.g., the mononym "Mausam");
                    # an empty element's text is None, so test truthiness
                    if name_part.tag == "first" and name_part.text:
                        gn = make_simple_element("given_name",
                                                 parent=pn,
                                                 text=name_part.text)
                    elif name_part.tag == "last":
                        sn = make_simple_element("surname",
                                                 text=name_part.text,
                                                 parent=pn)

        # Assemble Event Metadata
        em = make_simple_element("event_metadata", parent=c)
        cn = make_simple_element("conference_name", parent=em, text=booktitle)
        cl = make_simple_element("conference_location",
                                 parent=em,
                                 text=address)
        cd = make_simple_element(
            "conference_date",
            parent=em,
            attrib={
                "start_year": year,
                "end_year": year,
                "start_month": start_month,
                "end_month": end_month,
            },
        )

        # Assemble Proceedings Metadata
        pm = make_simple_element("proceedings_metadata",
                                 parent=c,
                                 attrib={"language": "en"})
        pt = make_simple_element("proceedings_title",
                                 parent=pm,
                                 text=booktitle)
        p = make_simple_element("publisher", parent=pm)
        pn = make_simple_element("publisher_name", parent=p, text=publisher)
        pp = make_simple_element("publisher_place",
                                 parent=p,
                                 text=PUBLISHER_PLACE)
        pd = make_simple_element("publication_date", parent=pm)
        y = make_simple_element("year", parent=pd, text=year)
        noisbn = make_simple_element("noisbn",
                                     parent=pm,
                                     attrib={"reason": "simple_series"})

        # DOI assignation data
        dd = make_simple_element("doi_data", parent=pm)
        doi = make_simple_element("doi", parent=dd, text=DOI_PREFIX + url)
        resource = make_simple_element("resource",
                                       parent=dd,
                                       text=ANTHOLOGY_URL.format(url))

        for paper in v.findall("./paper"):
            ## Individual Paper Data
            paper_id = paper.attrib["id"]
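            # Prefer the explicit <url> text; otherwise derive the URL from the
            # volume id (old-style ids take zero-padded 2- or 3-digit paper numbers)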
            if paper.find("./url") is not None:
                url = paper.find("./url").text
            else:
                if is_newstyle_id(full_volume_id):
                    url = f"{full_volume_id}.{paper_id}"
                elif len(full_volume_id) == 6:
                    url = f"{full_volume_id}{int(paper_id):02d}"
                elif len(full_volume_id) == 5:
                    url = f"{full_volume_id}{int(paper_id):03d}"

            cp = make_simple_element("conference_paper", parent=c)

            # contributors
            contribs = make_simple_element("contributors", parent=cp)
            author_index = 0
            for author in paper.findall("./author"):
                pn = make_simple_element(
                    "person_name",
                    parent=contribs,
                    attrib={
                        "contributor_role": "author",
                        "sequence":
                        "first" if author_index == 0 else "additional",
                    },
                )
                author_index += 1

                for name_part in author:
                    # Skip empty first names; an empty element's text is None
                    if name_part.tag == "first" and name_part.text:
                        gn = make_simple_element("given_name",
                                                 parent=pn,
                                                 text=name_part.text)
                    elif name_part.tag == "last":
                        sn = make_simple_element("surname",
                                                 text=name_part.text,
                                                 parent=pn)

            for title in paper.iter(tag="title"):
                o_titles = make_simple_element("titles", parent=cp)
                o_title = make_simple_element("title",
                                              parent=o_titles,
                                              text=formatter.as_text(title))

            pd = make_simple_element("publication_date", parent=cp)
            o_year = make_simple_element("year", parent=pd)
            o_year.text = year

            for pages in paper.iter(tag="pages"):
                o_pages = make_simple_element("pages", parent=cp)
                fp = make_simple_element("first_page", parent=o_pages)
                lp = make_simple_element("last_page", parent=o_pages)
                try:
                    fp.text = re.split("[-–]", pages.text)[0]
                    lp.text = re.split("[-–]", pages.text)[1]
                except IndexError as e:  # only one page
                    fp.text = pages.text
                    lp.text = pages.text

            # DOI assignation data
            dd = make_simple_element("doi_data", parent=cp)
            doi = make_simple_element("doi", parent=dd, text=DOI_PREFIX + url)
            resource = make_simple_element("resource",
                                           parent=dd,
                                           text=ANTHOLOGY_URL.format(url))

    print(
        etree.tostring(
            new_volume,
            pretty_print=True,
            encoding="UTF-8",
            xml_declaration=True,
            with_tail=True,
        ).decode("utf-8"))