Exemplo n.º 1
0
def upload_file(filepath: str):
    """
    Uploads regular PDFs or attachments to their correct place on the aclweb server.

    The destination directory is derived from the file name:

    * ``P19-1001.pdf`` (two dot-separated parts) goes under ``pdf/``
    * ``P19-1001.Attachment.pdf`` (three parts) goes under ``attachments/``

    Any other file name shape leaves the relative destination empty, so the
    file is copied directly into ``ACLWEB_FILE_ROOT`` (unchanged legacy
    behavior).

    The local file is made world-readable (0o644) before the copy, and the
    ``scp`` call is attempted up to three times.

    :param filepath: Path of the local file to upload.
    """

    relative_dest_path = ''

    os.chmod(filepath, 0o644)

    filename = os.path.basename(filepath)
    fileparts = filename.split('.')
    if len(fileparts) in (2, 3):
        collection_id, _, _ = deconstruct_anthology_id(fileparts[0])
        # The first character of the collection ID names the top-level
        # directory.  It must be computed for both kinds of file; previously
        # the attachment branch used it without ever assigning it.
        collection = collection_id[0]
        top_dir = 'pdf' if len(fileparts) == 2 else 'attachments'
        relative_dest_path = f'{top_dir}/{collection}/{collection_id}/{filename}'

    destination = f'aclweb:{ACLWEB_FILE_ROOT}/{relative_dest_path}'
    # Kept only for logging; the call itself uses an argument list so that
    # local paths containing spaces or shell metacharacters are passed intact.
    command = f'scp -q {filepath} {destination}'

    for attempt in range(1, 4):
        # This fails sometimes for no reason, so try a couple of times
        if attempt > 1:
            print(f'-> Failed for some reason, attempt #{attempt}',
                  file=sys.stderr)
        retcode = subprocess.call(['scp', '-q', filepath, destination])
        print(f'{command} -> {retcode}', file=sys.stderr)
        if retcode == 0:
            break
Exemplo n.º 2
0
def process_volume(anthology_volume):
    """
    Adds DOIs to every entry of one volume and rewrites the collection XML in place.

    The collection XML file is located relative to the running script
    (``../data/xml/<collection_id>.xml``).  ``add_doi`` is called for each
    node, and the script sleeps one second after every DOI that was added.
    Reads the module-level ``args.force`` flag.

    Exits the process with status 1 if the volume is not present in the XML.

    :param anthology_volume: Full volume identifier, as accepted by
        ``deconstruct_anthology_id``.
    """

    collection_id, volume_id, _ = deconstruct_anthology_id(anthology_volume)

    print(f'Attempting to add DOIs for {anthology_volume}', file=sys.stderr)

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'xml', f'{collection_id}.xml')
    tree = ET.parse(xml_file)

    formatter = MarkupFormatter()

    volume = tree.getroot().find(f"./volume[@id='{volume_id}']")
    if volume is None:
        # Report the requested ID; `volume` itself is always None here.
        print(f'-> FATAL: volume {anthology_volume} not found in the Anthology', file=sys.stderr)
        sys.exit(1)

    volume_booktitle = volume.find("./meta/booktitle")
    volume_title = formatter.as_text(volume_booktitle)
    print(f'-> found existing volume "{volume_title}"', file=sys.stderr)

    # Iterating an Element yields its children, so this visits the children
    # of <frontmatter> (as the original code did) followed by every <paper>.
    # NOTE(review): confirm whether add_doi is meant to receive the
    # <frontmatter> element itself rather than its children.
    frontmatter = volume.find('frontmatter')
    # Compare against None explicitly: an Element without children is falsy.
    frontmatter_nodes = frontmatter if frontmatter is not None else []

    num_added = 0
    for paper in chain(frontmatter_nodes, volume.findall('paper')):
        added = add_doi(paper, collection_id, volume_id, force=args.force)
        if added:
            num_added += 1
            sleep(1)

    indent(tree.getroot())

    tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
    print(f'-> added {num_added} DOIs to the XML for collection {collection_id}', file=sys.stderr)
Exemplo n.º 3
0
def process_volume(anthology_volume):
    """
    Fetches PDFs for the papers of a collection that the Anthology does not serve yet.

    Every <paper> in the collection XML is checked: when
    ``test_url_code(infer_url(...))`` is falsy for the paper's <url> text,
    the PDF is retrieved from mitpressjournals.org using the paper's <doi>
    and stored under ``args.anthology_files_dir``.  The script pauses one
    second after each download.
    """

    collection_id, volume_id, _ = deconstruct_anthology_id(anthology_volume)

    # New-style IDs are filed under the venue part of the collection ID;
    # old-style IDs under <first letter>/<collection ID>.
    venue_path = (collection_id.split(".")[1]
                  if is_newstyle_id(anthology_volume) else os.path.join(
                      collection_id[0], collection_id))

    print(f"Downloading PDFs for {anthology_volume}", file=sys.stderr)

    # Locate the collection XML relative to the running script
    script_dir = os.path.dirname(sys.argv[0])
    xml_file = os.path.join(script_dir, "..", "data", "xml",
                            f"{collection_id}.xml")
    root = ET.parse(xml_file).getroot()

    for paper in root.findall(".//paper"):
        anthid = paper.find("./url").text

        # Nothing to do when the Anthology already serves this paper
        if test_url_code(infer_url(anthid)):
            continue

        doi = paper.find("./doi").text
        doi_pdf = f"https://www.mitpressjournals.org/doi/pdf/{doi}"
        local_path = os.path.join(args.anthology_files_dir, venue_path,
                                  f"{anthid}.pdf")

        target_dir = os.path.dirname(local_path)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        retrieve_url(doi_pdf, local_path)
        print(f"Saved {doi_pdf} to {local_path}")
        sleep(1)
Exemplo n.º 4
0
def main(args):
    """
    Adds an <award> element to a paper in the Anthology XML.

    Reads ``args.anthology_id`` and ``args.award``.  The collection XML file
    is located relative to the running script and is rewritten in place.

    Exits with status 1 (without touching the XML) when the paper cannot be
    found or when it already carries the same award, compared
    case-insensitively.
    """
    print(f"Adding {args.award} to {args.anthology_id}...")

    collection_id, volume_id, paper_id = deconstruct_anthology_id(
        args.anthology_id)

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(
        f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is None:
        print(f"Error: Can't find paper {args.anthology_id}, quitting")
        # Actually quit: continuing would dereference None below.
        sys.exit(1)

    # Check every existing <award>, not only the first one, and tolerate
    # empty elements whose text is None.
    wanted = args.award.lower()
    already_present = any(
        (existing_award.text or "").lower() == wanted
        for existing_award in paper.findall("./award"))
    if already_present:
        print(
            f"Error: Award {args.award} already exists for {args.anthology_id}, quitting"
        )
        sys.exit(1)

    make_simple_element("award", args.award, parent=paper)
    indent(tree.getroot())

    tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
Exemplo n.º 5
0
def volume_sorter(volume_tuple):
    """
    Sort key for volume tuples.

    Takes the volume ID from the first slot of the tuple and returns
    ``(year, volume_id)``, where the year is whatever ``infer_year``
    derives from the volume's collection ID.
    """
    volume_id = volume_tuple[0]
    collection_id, _, _ = deconstruct_anthology_id(volume_id)
    return infer_year(collection_id), volume_id
Exemplo n.º 6
0
    def remove_volume(self, full_volume_id):
        """
        Drops a volume from this object's per-year event listing.

        Volumes with future ingestion dates are not built and may need to be
        removed from a SIG's listing.  Entries of ``self.events_by_year[year]``
        whose first item equals ``full_volume_id`` are filtered out; nothing
        happens when the year has no entry.

        `full_volume_id` looks like `P19-1` or `W19-31`
        """
        collection_id, _, _ = deconstruct_anthology_id(full_volume_id)
        year = int(infer_year(collection_id))
        if year not in self.events_by_year:
            return

        remaining = []
        for event in self.events_by_year[year]:
            if event[0] == full_volume_id:
                continue
            remaining.append(event)
        self.events_by_year[year] = remaining
Exemplo n.º 7
0
def add_video_tag(anth_paper, xml_parse):
    """
    Creates a <video> element for a paper via ``make_simple_element``.

    The paper node is looked up in ``xml_parse`` by volume and paper ID and
    passed as the parent of the new element.  ``anth_paper.presentation_id``
    is used verbatim as the link when it already starts with "http";
    otherwise it is appended to the slideslive.com base URL.
    """
    _, vol_id, paper_id = deconstruct_anthology_id(anth_paper.anthology_id)
    paper = xml_parse.find(f'./volume[@id="{vol_id}"]/paper[@id="{paper_id}"]')

    presentation = anth_paper.presentation_id
    video_url = (presentation if presentation.startswith("http") else
                 f"https://slideslive.com/{presentation}")

    video_attributes = {"tag": "video", "href": video_url}
    make_simple_element("video", attrib=video_attributes, parent=paper)
def main(args):
    """
    Registers attachment files, listed one path per line on stdin, in the Anthology XML.

    Each input line looks like ``attachments/D/D15/D15-1272.Attachment.pdf``.
    The file name is split on "." into the Anthology ID, the attachment kind
    and the remainder.  If the paper (or the frontmatter, for paper ID 0)
    does not yet have an <attachment> with that file name, one is appended
    and the collection XML is rewritten.

    Lines that cannot be parsed are reported and skipped.  A paper that is
    missing from the XML is fatal (exit status 1).
    """

    for lineno, line in enumerate(sys.stdin, 1):
        # attachments/D/D15/D15-1272.Attachment.pdf
        tokens = line.rstrip().split("/")
        attachment_file_name = tokens[-1]
        try:
            anth_id, kind, *_ = attachment_file_name.split(".")
        except ValueError:
            # Fewer than two dot-separated pieces
            print(f"Couldn't parse file {attachment_file_name} into 3 pieces")
            continue

        try:
            collection_id, volume_id, paper_id = deconstruct_anthology_id(
                anth_id)
        except Exception:
            print(f"[{lineno}] BAD LINE {line.rstrip()}")
            # Skip the line: without this the code below would run with
            # unset IDs, or with IDs left over from the previous line.
            continue

        # Update XML
        xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data",
                                "xml", f"{collection_id}.xml")
        tree = ET.parse(xml_file)

        if int(paper_id) == 0:
            paper = tree.getroot().find(
                f"./volume[@id='{volume_id}']/frontmatter")
        else:
            paper = tree.getroot().find(
                f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")

        if paper is None:
            print(
                f"-> FATAL: [{lineno}] paper ({anth_id}) not found in the Anthology",
                file=sys.stderr,
            )
            sys.exit(1)

        # Nothing to do if the attachment is already listed
        already_listed = any(
            existing.text == attachment_file_name
            for existing in paper.findall("attachment"))
        if already_listed:
            continue

        attachment = ET.Element("attachment")
        attachment.attrib["type"] = kind.lower()
        attachment.text = attachment_file_name

        paper.append(attachment)
        indent(tree.getroot())
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
        print(
            f"-> [{lineno}] added attachment {attachment_file_name} to the XML",
            file=sys.stderr,
        )
Exemplo n.º 9
0
        def __init__(self, acronym, letter, anth_id):
            """
            Records the venue information for one Anthology ID.

            ``parent_venue`` is the lower-cased acronym.  For new-style
            collection IDs the venue is the second dot-separated component
            and is compared with ``parent_venue``; otherwise the venue is the
            first character of the collection ID and is compared with
            ``letter``.
            """
            self.parent_venue = acronym.lower()
            self.anth_id = anth_id

            collection_id, self.volume_id, _ = deconstruct_anthology_id(anth_id)
            newstyle = is_newstyle_id(collection_id)

            self.venue = collection_id.split(".")[1] if newstyle else collection_id[0]
            expected_venue = self.parent_venue if newstyle else letter
            self.is_parent_venue = self.venue == expected_venue
Exemplo n.º 10
0
def get_dest_path(filepath: str):
    """
    Returns the destination path on the remote server for the file.

    The number of dot-separated parts in the file name selects the top-level
    directory: two parts (e.g., P19-1001.pdf) map to ``pdf``, three parts
    (e.g., P19-1001.Attachment.pdf) map to ``attachments``.  Any other shape
    raises an Exception.
    """
    top_level_dirs = {2: "pdf", 3: "attachments"}

    fileparts = os.path.basename(filepath).split(".")
    top_level = top_level_dirs.get(len(fileparts))
    if top_level is None:
        raise Exception(f"Can't determine target destination from {filepath}")

    collection_id, _, _ = deconstruct_anthology_id(fileparts[0])
    # Files are grouped by the first character of the collection ID
    dest_path = "/".join((top_level, collection_id[0], collection_id))

    return f"{ANTHOLOGY_HOST}:{ANTHOLOGY_FILE_ROOT}/{dest_path}"
Exemplo n.º 11
0
def main(args):
    """
    Downloads an Anthology paper and adds a RETRACTED watermark, then updates the XML
    with an appropriate <revision> and <retracted> tag.

    The watermarked PDF is produced in a temporary directory and handed to
    ``add_revision``.  Afterwards a <retracted> element carrying
    ``args.explanation`` and today's date is attached to the paper (or to the
    frontmatter when the paper ID is "0").  Exits with status 2 if the paper
    node cannot be found.
    """

    with tempfile.TemporaryDirectory() as tempdir:

        new_pdf = add_watermark(args.anthology_id, workdir=tempdir)

        add_revision(
            args.anthology_id,
            new_pdf,
            explanation="Retracted.",
            change_type="revision",
            dry_run=False,
        )

        xml_file = get_xml_file(args.anthology_id)
        collection_id, volume_id, paper_id = deconstruct_anthology_id(
            args.anthology_id)
        tree = ET.parse(xml_file)

        # Paper ID "0" denotes the volume's frontmatter
        volume_xpath = f"./volume[@id='{volume_id}']"
        if paper_id == "0":
            node_xpath = f"{volume_xpath}/frontmatter"
        else:
            node_xpath = f"{volume_xpath}/paper[@id='{paper_id}']"
        paper = tree.getroot().find(node_xpath)

        if paper is None:
            print(f"Couldn't find paper {args.anthology_id}!", file=sys.stderr)
            sys.exit(2)

        print("Modifying the XML", file=sys.stderr)
        today = datetime.now().strftime("%Y-%m-%d")
        make_simple_element("retracted",
                            args.explanation,
                            attrib={"date": today},
                            parent=paper)
        indent(tree.getroot())
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
Exemplo n.º 12
0
def split_anth_id(anth_id):
    """Return the collection ID component of an Anthology ID."""
    collection, _volume, _paper = deconstruct_anthology_id(anth_id)
    return collection
Exemplo n.º 13
0
def bib2xml(bibfilename, anthology_id):
    """
    Moved here from ACLPUB's anthology_xml.py script.

    Converts the single BibTeX entry in ``bibfilename`` into a <paper> XML
    element.

    :param bibfilename: Path of the BibTeX file for one paper.
    :param anthology_id: Anthology ID of the paper; also used verbatim as the
        text of the <url> element.
    :return: The new <paper> element, or None when the ID has an empty paper
        number (the master bib file, which is skipped).

    Exits with status 1 when the file contains no entry or the entry has no
    fields.
    """

    fields = [
        'title',
        'author',
        'editor',
        'booktitle',
        'month',
        'year',
        'address',
        'publisher',
        'pages',
        'abstract',
        'url',
        'doi',
        'language',
    ]

    collection_id, volume_name, paper_no = deconstruct_anthology_id(
        anthology_id)
    if paper_no == '':
        return  # skip the master bib file; we only process the individual files

    bibdata = read_bibtex(bibfilename)
    num_entries = len(bibdata.entries)
    if num_entries == 0:
        # Previously this crashed with an IndexError on the lookup below.
        log(f"no entries in {bibfilename}")
        sys.exit(1)
    if num_entries > 1:
        log(f"more than one entry in {bibfilename}")
    # Take the first entry without indexing: items() may return a view,
    # which is not subscriptable.
    bibkey, bibentry = next(iter(bibdata.entries.items()))
    if len(bibentry.fields) == 0:
        log(f"parsing bib of paper {paper_no} failed")
        sys.exit(1)

    paper = make_simple_element("paper", attrib={"id": paper_no})
    for field in list(bibentry.fields) + list(bibentry.persons):
        if field not in fields:
            log(f"unknown field {field}")

    for field in fields:
        if field in ['author', 'editor']:
            if field in bibentry.persons:
                for person in bibentry.persons[field]:
                    first_text = ' '.join(person.bibtex_first_names)
                    last_text = ' '.join(person.prelast_names +
                                         person.last_names)
                    if person.lineage_names:
                        last_text += ', ' + ' '.join(person.lineage_names)

                    # Don't distinguish between authors that have only a first name
                    # vs. authors that have only a last name; always make it a last name.
                    if last_text.strip() in [
                            '',
                            '-',
                    ]:  # Some START users have '-' for null
                        last_text = first_text
                        first_text = ''

                    name_node = make_simple_element(field, parent=paper)
                    make_simple_element("first", first_text, parent=name_node)
                    make_simple_element("last", last_text, parent=name_node)
        else:
            if field == 'url':
                value = f"{anthology_id}"
            elif field in bibentry.fields:
                value = bibentry.fields[field]
            # The two branches below only take effect if 'bibtype' or
            # 'bibkey' is added to `fields`; neither is listed at present.
            elif field == 'bibtype':
                value = bibentry.type
            elif field == 'bibkey':
                value = bibkey
            else:
                continue

            make_simple_element(field, text=value, parent=paper)

    return paper
Exemplo n.º 14
0
 def add_volume(self, full_volume_id):
     """
     Files a volume under its year in ``self.events_by_year``.

     The year is the integer value of ``infer_year`` applied to the volume's
     collection ID; the list for that year is created on first use.
     """
     collection_id, _, _ = deconstruct_anthology_id(full_volume_id)
     year = int(infer_year(collection_id))
     self.events_by_year.setdefault(year, []).append(full_volume_id)
def main(volumes):
    """
    Builds a Crossref ``doi_batch`` XML document for the given volumes and
    prints it to stdout.

    The document has a <head> (batch id, timestamp, depositor, registrant)
    and a <body> holding one <conference> element per volume.  Each
    <conference> gets the editors as contributors, event and proceedings
    metadata taken from the volume's <meta>, and one <conference_paper> per
    <paper> in the volume.

    Volumes are processed in sorted order.  A volume that is missing from its
    collection XML is reported on stderr and skipped; a month that cannot be
    parsed terminates the script with status 1.

    :param volumes: Iterable of full volume IDs, as accepted by
        ``deconstruct_anthology_id``.
    """

    formatter = MarkupFormatter()

    ## Assemble container
    doi_batch = make_simple_element(
        "doi_batch",
        attrib={
            "xmlns": "http://www.crossref.org/schema/4.4.1",
            "{http://www.w3.org/2001/XMLSchema-instance}schemaLocation":
            "http://www.crossref.org/schema/4.4.1 http://www.crossref.org/schema/deposit/crossref4.4.1.xsd",
            "version": "4.4.1",
        },
        namespaces={"xsi": "http://www.w3.org/2001/XMLSchema-instance"},
    )
    new_volume = etree.ElementTree(doi_batch)

    ## Assemble head
    head = make_simple_element("head", parent=new_volume.getroot())
    # Both the batch id and the timestamp are the current Unix time in seconds
    dbi = make_simple_element("doi_batch_id",
                              text=str(int(time.time())),
                              parent=head)

    timestamp = make_simple_element("timestamp",
                                    text=str(int(time.time())),
                                    parent=head)

    depositor = make_simple_element("depositor", parent=head)
    depositor_name = make_simple_element("depositor_name",
                                         text=DEPOSITOR_NAME,
                                         parent=depositor)
    email_address = make_simple_element("email_address",
                                        text=EMAIL_ADDRESS,
                                        parent=depositor)

    registrant = make_simple_element("registrant",
                                     text=REGISTRANT,
                                     parent=head)

    ## Assemble body
    body = make_simple_element("body", parent=new_volume.getroot())

    # NOTE(review): these three are initialized once, outside the loop, so a
    # volume whose <meta> lacks <year> or <month> reuses the values of the
    # previous volume.
    year = ""
    start_month = ""
    end_month = ""

    for full_volume_id in sorted(volumes):
        collection_id, volume_id, _ = deconstruct_anthology_id(full_volume_id)

        # The collection XML is located relative to the running script
        collection_file = os.path.join(os.path.dirname(sys.argv[0]), "..",
                                       "data", "xml", f"{collection_id}.xml")
        tree = etree.parse(collection_file)

        v = tree.getroot().find(f"./volume[@id='{volume_id}']")
        if v is None:
            print(f"* Can't find volume {full_volume_id}", file=sys.stderr)
            continue

        ## Assemble frontmatter
        c = make_simple_element("conference", parent=body)
        contribs = make_simple_element("contributors", parent=c)
        editor_index = 0

        # Walk the children of <meta>, collecting scalar metadata into local
        # variables and turning each <editor> into a <person_name>.
        # NOTE(review): url, booktitle, address and publisher are only
        # assigned when the matching tag is present; if a tag is missing they
        # are either unbound (first volume) or left over from the previous
        # volume when used further down.
        meta = v.find("./meta")
        for tag in meta:
            if tag.tag == "year":
                year = tag.text
            elif tag.tag == "month":
                month = tag.text
                # A month range is split on a hyphen or en dash; both halves
                # are mapped through MONTH_HASH.
                try:
                    start_month = MONTH_HASH[re.split("[-–]", month)[0]]
                    end_month = MONTH_HASH[re.split("[-–]", month)[1]]
                except IndexError as e:  # only one month
                    # NOTE(review): a KeyError raised by these lookups is not
                    # caught by the `except Exception` clause below, because
                    # it occurs inside a sibling handler.
                    start_month = MONTH_HASH[month]
                    end_month = MONTH_HASH[month]
                except Exception as e:
                    print(
                        f"FATAL: can't parse month {month} in {full_volume_id}",
                        file=sys.stderr,
                    )
                    sys.exit(1)
            elif tag.tag == "url":
                url = tag.text
            elif tag.tag == "booktitle":
                booktitle = formatter.as_text(tag)
            elif tag.tag == "address":
                address = tag.text
            elif tag.tag == "publisher":
                publisher = tag.text
            elif tag.tag == "editor":
                # The first editor is marked "first", all later ones "additional"
                pn = make_simple_element(
                    "person_name",
                    parent=contribs,
                    attrib={
                        "contributor_role": "chair",
                        "sequence":
                        "first" if editor_index == 0 else "additional",
                    },
                )
                editor_index += 1

                for name_part in tag:
                    if name_part.tag == "first":
                        gn = make_simple_element("given_name",
                                                 parent=pn,
                                                 text=name_part.text)
                    elif name_part.tag == "last":
                        sn = make_simple_element("surname",
                                                 text=name_part.text,
                                                 parent=pn)

        # Assemble Event Metadata
        em = make_simple_element("event_metadata", parent=c)
        cn = make_simple_element("conference_name", parent=em, text=booktitle)
        cl = make_simple_element("conference_location",
                                 parent=em,
                                 text=address)
        # The same year is used as both start and end year
        cd = make_simple_element(
            "conference_date",
            parent=em,
            attrib={
                "start_year": year,
                "end_year": year,
                "start_month": start_month,
                "end_month": end_month,
            },
        )

        # Assemble Proceedings Metadata
        pm = make_simple_element("proceedings_metadata",
                                 parent=c,
                                 attrib={"language": "en"})
        pt = make_simple_element("proceedings_title",
                                 parent=pm,
                                 text=booktitle)
        p = make_simple_element("publisher", parent=pm)
        pn = make_simple_element("publisher_name", parent=p, text=publisher)
        pp = make_simple_element("publisher_place",
                                 parent=p,
                                 text=PUBLISHER_PLACE)
        pd = make_simple_element("publication_date", parent=pm)
        y = make_simple_element("year", parent=pd, text=year)
        noisbn = make_simple_element("noisbn",
                                     parent=pm,
                                     attrib={"reason": "simple_series"})

        # DOI assignation data
        # The volume-level DOI and resource link are both built from the
        # <url> text of the volume's <meta>.
        dd = make_simple_element("doi_data", parent=pm)
        doi = make_simple_element("doi", parent=dd, text=DOI_PREFIX + url)
        resource = make_simple_element("resource",
                                       parent=dd,
                                       text=ANTHOLOGY_URL.format(url))

        for paper in v.findall("./paper"):
            ## Individual Paper Data

            # TODO: this is not future-proof, should use anthology.util library functions
            # The paper suffix is the paper id zero-padded to 2 digits when
            # the volume url is 6 characters long and to 3 digits when it is
            # 5 characters long; for any other length it stays empty.
            aa_id = ""
            if len(url) == 6:
                aa_id = "{:02d}".format(int(paper.attrib["id"]))
            else:
                if len(url) == 5:
                    aa_id = "{:03d}".format(int(paper.attrib["id"]))

            cp = make_simple_element("conference_paper", parent=c)

            # contributors
            contribs = make_simple_element("contributors", parent=cp)
            author_index = 0
            for author in paper.findall("./author"):
                pn = make_simple_element(
                    "person_name",
                    parent=contribs,
                    attrib={
                        "contributor_role": "author",
                        "sequence":
                        "first" if author_index == 0 else "additional",
                    },
                )
                author_index += 1

                for name_part in author:
                    if name_part.tag == "first":
                        gn = make_simple_element("given_name",
                                                 parent=pn,
                                                 text=name_part.text)
                    elif name_part.tag == "last":
                        sn = make_simple_element("surname",
                                                 text=name_part.text,
                                                 parent=pn)

            # One <titles> wrapper is emitted per <title> found anywhere
            # below the paper element
            for title in paper.iter(tag="title"):
                o_titles = make_simple_element("titles", parent=cp)
                o_title = make_simple_element("title",
                                              parent=o_titles,
                                              text=formatter.as_text(title))

            pd = make_simple_element("publication_date", parent=cp)
            o_year = make_simple_element("year", parent=pd)
            o_year.text = year

            # A page range is split on a hyphen or en dash; a single page is
            # used as both first and last page
            for pages in paper.iter(tag="pages"):
                o_pages = make_simple_element("pages", parent=cp)
                fp = make_simple_element("first_page", parent=o_pages)
                lp = make_simple_element("last_page", parent=o_pages)
                try:
                    fp.text = re.split("[-–]", pages.text)[0]
                    lp.text = re.split("[-–]", pages.text)[1]
                except IndexError as e:  # only one page
                    fp.text = pages.text
                    lp.text = pages.text

            # DOI assignation data
            dd = make_simple_element("doi_data", parent=cp)
            doi = make_simple_element("doi",
                                      parent=dd,
                                      text=DOI_PREFIX + url + aa_id)
            resource = make_simple_element("resource",
                                           parent=dd,
                                           text=ANTHOLOGY_URL.format(url +
                                                                     aa_id))

    # Serialize the whole batch and write it to stdout
    print(
        etree.tostring(
            new_volume,
            pretty_print=True,
            encoding="UTF-8",
            xml_declaration=True,
            with_tail=True,
        ).decode("utf-8"))
Exemplo n.º 16
0
def main(args):
    """
    Ingests a revision or an erratum for an existing Anthology paper.

    Reads ``args.anthology_id``, ``args.path`` (a local path or an http(s)
    URL), ``args.erratum``, ``args.explanation``, ``args.date``,
    ``args.anthology_dir`` and ``args.dry_run``.

    Steps:

    1. Obtain the new file (downloading it to a temp file when ``args.path``
       is a URL) and check that its detected MIME type matches its detected
       extension.
    2. Add a <revision>/<erratum> node to the paper in the collection XML.
       For the first revision, an explicit node with id 1 for the original
       version is added as well.
    3. For the first revision, download the currently published PDF as the
       "v1" file.
    4. Copy the new file to its versioned path and, for revisions, over the
       canonical PDF.

    With ``args.dry_run`` set, neither the XML nor any PDF is written; the
    actions are only logged.  Exits with status 1 on download (SSL) errors,
    on an unrecognized file type, or when the paper is not in the XML.
    """
    def maybe_copy(file_from, file_to):
        """Copy a file and make it world-readable, unless this is a dry run."""
        if not args.dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    change_type = "erratum" if args.erratum else "revision"
    change_letter = "e" if args.erratum else "v"

    print(f"Processing {change_type} to {args.anthology_id}...")

    # TODO: make sure path exists, or download URL to temp file
    is_remote = args.path.startswith("http")
    if is_remote:
        temp_fd, input_file_path = tempfile.mkstemp()
        # mkstemp returns an open descriptor that the caller must close; the
        # file is reopened by name below.
        os.close(temp_fd)
        try:
            print(f"-> Downloading file from {args.path}", file=sys.stderr)
            with urllib.request.urlopen(args.path) as url, open(
                    input_file_path, mode="wb") as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            print("An SSL error was encountered in downloading the files.",
                  file=sys.stderr)
            # Don't leave the temp file behind
            os.remove(input_file_path)
            sys.exit(1)
    else:
        input_file_path = args.path

    detected = filetype.guess(input_file_path)
    if detected is None or not detected.mime.endswith(detected.extension):
        mime_type = 'UNKNOWN' if detected is None else detected.mime
        print(
            f"FATAL: {args.anthology_id} file {args.path} has MIME type {mime_type}",
            file=sys.stderr,
        )
        sys.exit(1)

    collection_id, volume_id, paper_id = deconstruct_anthology_id(
        args.anthology_id)

    # The new version
    revno = None

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(
        f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is None:
        print(
            f"-> FATAL: paper ID {args.anthology_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    # Errata start at 1.  Revisions start at 2, because revision 1 is the
    # original version.  Otherwise continue after the last existing node.
    revisions = paper.findall(change_type)
    revno = 1 if args.erratum else 2
    for revision in revisions:
        revno = int(revision.attrib["id"]) + 1

    if not args.dry_run:
        if not args.erratum and revno == 2:
            # First revision requires making the original version explicit
            # NOTE(review): the hash stored here is the checksum of the NEW
            # file, not of the original PDF -- confirm this is intended.
            revision = make_simple_element(
                change_type,
                None,
                attrib={
                    "id": "1",
                    "href": f"{args.anthology_id}{change_letter}1",
                    "hash": checksum,
                },
                parent=paper,
            )

        revision = make_simple_element(
            change_type,
            args.explanation,
            attrib={
                "id": str(revno),
                "href": f"{args.anthology_id}{change_letter}{revno}",
                "hash": checksum,
                "date": args.date,
            },
            parent=paper,
        )
        indent(tree.getroot())

        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
        print(f'-> Added {change_type} node "{revision.text}" to XML',
              file=sys.stderr)

    output_dir = os.path.join(args.anthology_dir, "pdf", collection_id[0],
                              collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{args.anthology_id}.pdf")

    if not args.erratum and revno == 2:
        # There are no versioned files the first time around, so create the first one
        # (essentially backing up the original version)
        revised_file_v1_path = os.path.join(
            output_dir, f"{args.anthology_id}{change_letter}1.pdf")

        current_version = ANTHOLOGY_PDF.format(args.anthology_id)
        # The messages below name `current_version`, the URL that is actually
        # fetched here; they previously reported args.path.
        if not args.dry_run:
            try:
                print(
                    f"-> Downloading file from {current_version} to {revised_file_v1_path}",
                    file=sys.stderr,
                )
                with urllib.request.urlopen(current_version) as url, open(
                        revised_file_v1_path, mode="wb") as fh:
                    fh.write(url.read())
            except ssl.SSLError:
                print(
                    f"-> FATAL: An SSL error was encountered in downloading {current_version}.",
                    file=sys.stderr,
                )
                sys.exit(1)
        else:
            print(
                f"-> DRY RUN: Downloading file from {current_version} to {revised_file_v1_path}",
                file=sys.stderr,
            )

    revised_file_versioned_path = os.path.join(
        output_dir, f"{args.anthology_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(input_file_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if not args.erratum:
        maybe_copy(input_file_path, canonical_path)

    # Clean up the downloaded temp file
    if is_remote:
        os.remove(input_file_path)
Exemplo n.º 17
0
def main(args):
    """
    Ingests an attachment for an existing Anthology paper.

    Reads ``args.anthology_id``, ``args.path`` (a local path or an http(s)
    URL), ``args.type`` and ``args.attachment_root``.

    The attachment is named ``<anthology_id>.<type>.<extension>``.  An
    <attachment> node is added to the paper in the collection XML unless one
    with the same file name already exists, and the file is copied below
    ``args.attachment_root`` unless the target already exists.

    Exits with status 1 on download (SSL) errors, on a file extension that is
    not in ``ALLOWED_TYPES``, or when the paper is not in the XML.
    """

    print(f'Processing attachment for {args.anthology_id}', file=sys.stderr)

    is_remote = args.path.startswith('http')
    if is_remote:
        temp_fd, input_file_path = tempfile.mkstemp()
        # mkstemp returns an open descriptor that the caller must close; the
        # file is reopened by name below.
        os.close(temp_fd)
        try:
            print('-> Downloading file from {}'.format(args.path),
                  file=sys.stderr)
            with urllib.request.urlopen(args.path) as url, open(
                    input_file_path, mode='wb') as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            print(
                '-> FATAL: An SSL error was encountered in downloading the files.',
                file=sys.stderr)
            # Don't leave the temp file behind
            os.remove(input_file_path)
            sys.exit(1)
    else:
        input_file_path = args.path

    collection_id, volume_id, paper_id = deconstruct_anthology_id(
        args.anthology_id)
    # The extension is taken from the original path or URL, not from the
    # temp file, whose name carries no extension.
    paper_extension = args.path.split('.')[-1]

    if paper_extension not in ALLOWED_TYPES:
        print(
            f'-> FATAL: {args.anthology_id} unknown file extension {paper_extension}',
            file=sys.stderr)
        sys.exit(1)

    attachment_file_name = f'{args.anthology_id}.{args.type}.{paper_extension}'

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'xml',
                            f'{collection_id}.xml')
    tree = ET.parse(xml_file)

    paper = tree.getroot().find(
        f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is None:
        print(
            f'-> FATAL: paper (volume={volume_id}, paper={paper_id}) not found in the Anthology',
            file=sys.stderr)
        sys.exit(1)

    # Check if attachment already exists
    for attachment in paper.findall('attachment'):
        if attachment.text == attachment_file_name:
            print(
                f'-> attachment {attachment_file_name} already exists in the XML',
                file=sys.stderr)
            break
    else:
        attachment = ET.Element('attachment')
        attachment.attrib['type'] = args.type.lower()
        attachment.text = attachment_file_name

        paper.append(attachment)
        indent(tree.getroot())
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
        print(f'-> added attachment {attachment_file_name} to the XML',
              file=sys.stderr)

    # Make sure directory exists
    output_dir = os.path.join(args.attachment_root, collection_id[0],
                              collection_id)
    if not os.path.exists(output_dir):
        print(f'-> Creating directory {output_dir}', file=sys.stderr)
        os.makedirs(output_dir)

    # Copy file
    dest_path = os.path.join(output_dir, attachment_file_name)
    if os.path.exists(dest_path):
        print(
            f'-> target file {dest_path} already in place, refusing to overwrite',
            file=sys.stderr)
    else:
        shutil.copy(input_file_path, dest_path)
        os.chmod(dest_path, 0o644)
        print(f'-> copied {input_file_path} to {dest_path} and fixed perms',
              file=sys.stderr)

    # Clean up
    if is_remote:
        os.remove(input_file_path)
Exemplo n.º 18
0
def _add_person_name(contribs, person, role, index):
    """
    Appends a Crossref <person_name> element to `contribs` for an Anthology
    <editor> or <author> node, copying its <first> and <last> children.

    Arguments:
    - contribs: the <contributors> element to append to
    - person: the Anthology <editor> / <author> element
    - role: value for the contributor_role attribute ('chair' or 'author')
    - index: zero-based position of the person; the first one is marked as
      sequence 'first', all others as 'additional'
    """
    pn = make_simple_element(
        'person_name',
        parent=contribs,
        attrib={
            'contributor_role': role,
            'sequence': 'first' if index == 0 else 'additional'
        })

    for name_part in person:
        if name_part.tag == 'first':
            make_simple_element('given_name', parent=pn, text=name_part.text)
        elif name_part.tag == 'last':
            make_simple_element('surname', text=name_part.text, parent=pn)


def main(volumes):
    """
    Builds a Crossref deposit document for the given volumes and prints it
    to standard output.

    For every volume ID (processed in sorted order) the collection XML file
    under ../data/xml (relative to the script location) is parsed and one
    <conference> element is added to the batch, holding the volume's editors,
    its event and proceedings metadata, and one <conference_paper> per
    <paper> found in the volume.

    Volumes that cannot be found, or that carry no <url> in their <meta>
    block (so that no DOI can be formed), are reported on stderr and skipped.

    Arguments:
    - volumes: an iterable of full Anthology volume IDs
    """

    formatter = MarkupFormatter()

    ## Assemble container
    doi_batch = make_simple_element(
        'doi_batch',
        attrib={
            'xmlns': 'http://www.crossref.org/schema/4.4.1',
            '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation':
            'http://www.crossref.org/schema/4.4.1 http://www.crossref.org/schema/deposit/crossref4.4.1.xsd',
            'version': '4.4.1'
        },
        namespaces={'xsi': 'http://www.w3.org/2001/XMLSchema-instance'})
    new_volume = etree.ElementTree(doi_batch)

    ## Assemble head
    head = make_simple_element('head', parent=new_volume.getroot())
    make_simple_element('doi_batch_id',
                        text=str(int(time.time())),
                        parent=head)
    make_simple_element('timestamp', text=str(int(time.time())), parent=head)

    depositor = make_simple_element('depositor', parent=head)
    make_simple_element('depositor_name',
                        text=DEPOSITOR_NAME,
                        parent=depositor)
    make_simple_element('email_address',
                        text=EMAIL_ADDRESS,
                        parent=depositor)

    make_simple_element('registrant', text=REGISTRANT, parent=head)

    ## Assemble body
    body = make_simple_element('body', parent=new_volume.getroot())

    for full_volume_id in sorted(volumes):
        collection_id, volume_id, _ = deconstruct_anthology_id(full_volume_id)

        collection_file = os.path.join(os.path.dirname(sys.argv[0]), '..',
                                       'data', 'xml', f'{collection_id}.xml')
        tree = etree.parse(collection_file)

        v = tree.getroot().find(f"./volume[@id='{volume_id}']")
        if v is None:
            print(f"* Can't find volume {full_volume_id}", file=sys.stderr)
            continue

        # Metadata is reset for every volume so that a volume lacking a tag
        # never silently inherits the value read from the previous volume.
        year = ""
        start_month = ""
        end_month = ""
        url = None
        booktitle = None
        address = None
        publisher = None
        editors = []

        meta = v.find('./meta')
        for tag in (meta if meta is not None else ()):
            if tag.tag == 'year':
                year = tag.text
            elif tag.tag == 'month':
                # Either a single month or a range such as "July–August"
                month_parts = re.split('[-–]', tag.text)
                start_month = MONTH_HASH[month_parts[0]]
                end_month = MONTH_HASH[month_parts[1] if len(month_parts) > 1
                                       else month_parts[0]]
            elif tag.tag == 'url':
                url = tag.text
            elif tag.tag == 'booktitle':
                booktitle = tag.text
            elif tag.tag == 'address':
                address = tag.text
            elif tag.tag == 'publisher':
                publisher = tag.text
            elif tag.tag == 'editor':
                editors.append(tag)

        # Every DOI below is derived from the volume URL
        if not url:
            print(f"* Can't find a <url> for volume {full_volume_id}",
                  file=sys.stderr)
            continue

        for field_name, field_value in (('booktitle', booktitle),
                                        ('address', address),
                                        ('publisher', publisher)):
            if field_value is None:
                print(
                    f"* Volume {full_volume_id} has no <{field_name}> in its metadata",
                    file=sys.stderr)

        ## Assemble frontmatter
        c = make_simple_element('conference', parent=body)
        contribs = make_simple_element('contributors', parent=c)
        for editor_index, editor in enumerate(editors):
            _add_person_name(contribs, editor, 'chair', editor_index)

        # Assemble Event Metadata
        em = make_simple_element('event_metadata', parent=c)
        make_simple_element('conference_name', parent=em, text=booktitle)
        make_simple_element('conference_location', parent=em, text=address)
        make_simple_element('conference_date',
                            parent=em,
                            attrib={
                                'start_year': year,
                                'end_year': year,
                                'start_month': start_month,
                                'end_month': end_month
                            })

        # Assemble Proceedings Metadata
        pm = make_simple_element('proceedings_metadata',
                                 parent=c,
                                 attrib={'language': 'en'})
        make_simple_element('proceedings_title', parent=pm, text=booktitle)
        p = make_simple_element('publisher', parent=pm)
        make_simple_element('publisher_name', parent=p, text=publisher)
        make_simple_element('publisher_place',
                            parent=p,
                            text=PUBLISHER_PLACE)
        pd = make_simple_element('publication_date', parent=pm)
        make_simple_element('year', parent=pd, text=year)
        make_simple_element('noisbn',
                            parent=pm,
                            attrib={'reason': 'simple_series'})

        # DOI assignation data
        dd = make_simple_element('doi_data', parent=pm)
        make_simple_element('doi', parent=dd, text=DOI_PREFIX + url)
        make_simple_element('resource',
                            parent=dd,
                            text=ANTHOLOGY_URL.format(url))

        for paper in v.findall('./paper'):
            ## Individual Paper Data

            # TODO: this is not future-proof, should use anthology.util library functions
            # The paper number is zero-padded to two digits for six-character
            # volume URLs and to three digits for five-character ones; any
            # other URL length leaves the suffix empty.
            aa_id = ""
            if len(url) == 6:
                aa_id = '{:02d}'.format(int(paper.attrib['id']))
            elif len(url) == 5:
                aa_id = '{:03d}'.format(int(paper.attrib['id']))

            cp = make_simple_element('conference_paper', parent=c)

            # contributors
            contribs = make_simple_element('contributors', parent=cp)
            for author_index, author in enumerate(paper.findall('./author')):
                _add_person_name(contribs, author, 'author', author_index)

            for title in paper.iter(tag='title'):
                o_titles = make_simple_element('titles', parent=cp)
                make_simple_element('title',
                                    parent=o_titles,
                                    text=formatter.as_text(title))

            pd = make_simple_element('publication_date', parent=cp)
            o_year = make_simple_element('year', parent=pd)
            o_year.text = year

            for pages in paper.iter(tag='pages'):
                o_pages = make_simple_element('pages', parent=cp)
                fp = make_simple_element('first_page', parent=o_pages)
                lp = make_simple_element('last_page', parent=o_pages)
                # Either a page range or a single page
                page_parts = re.split('[-–]', pages.text)
                fp.text = page_parts[0]
                lp.text = page_parts[1] if len(page_parts) > 1 else page_parts[0]

            # DOI assignation data
            dd = make_simple_element('doi_data', parent=cp)
            make_simple_element('doi',
                                parent=dd,
                                text=DOI_PREFIX + url + aa_id)
            make_simple_element('resource',
                                parent=dd,
                                text=ANTHOLOGY_URL.format(url + aa_id))

    print(
        etree.tostring(new_volume,
                       pretty_print=True,
                       encoding='UTF-8',
                       xml_declaration=True,
                       with_tail=True).decode('utf-8'))
Exemplo n.º 19
0
def add_revision(anth_id,
                 pdf_path,
                 explanation,
                 change_type="revision",
                 dry_run=True,
                 date=None):
    """
    Takes an Anthology ID. It then adds a revision to the Anthology XML,
    updating and writing the XML file, and copies the PDFs into place.
    For PDFs, the revised PDF is saved to {anth_id}.pdf and {anth_id}v{version}.pdf.
    For the first revision, we first copy {anth_id}.pdf to {anth_id}v1.pdf.

    Errata are stored as {anth_id}e{version}.pdf only; they never replace the
    canonical PDF, so the hash on the paper's <url> tag is left untouched for
    them.

    Arguments:
    - anth_id: the Anthology ID of the paper (paper ID "0" addresses the
      volume's frontmatter)
    - pdf_path: local path of the new PDF
    - explanation: text stored in the new revision/erratum node
    - change_type: "revision" or "erratum"
    - dry_run: if true, the XML is not modified and file copies are only
      reported on stderr
    - date: date recorded on the node (defaults to today, YYYY-MM-DD)

    Exits with status 1 if the paper is not found in the XML, or if the first
    revision is requested for a paper without a <url> tag.
    """
    if date is None:
        now = datetime.now()
        date = f"{now.year}-{now.month:02d}-{now.day:02d}"

    def maybe_copy(file_from, file_to):
        if not dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    # The new version
    revno = None

    change_letter = "e" if change_type == "erratum" else "v"

    checksum = compute_hash_from_file(pdf_path)

    # Files for old-style IDs are stored under anthology-files/pdf/P/P19/*
    # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/*
    output_dir = get_pdf_dir(anth_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{anth_id}.pdf")

    # Update XML
    xml_file = get_xml_file(anth_id)
    collection_id, volume_id, paper_id = deconstruct_anthology_id(anth_id)
    tree = ET.parse(xml_file)
    if paper_id == "0":
        paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
    else:
        paper = tree.getroot().find(
            f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        # The next number follows the last existing node of this type; with
        # no existing nodes, errata start at 1 and revisions at 2 (version 1
        # being the original paper).
        revisions = paper.findall(change_type)
        revno = 1 if change_type == "erratum" else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not dry_run:
            url = paper.find("./url")

            # Update the URL hash on the <url> tag, but only when the
            # canonical PDF is actually going to be replaced
            if change_type == "revision" and url is not None:
                url.attrib["hash"] = checksum

            if change_type == "revision" and revno == 2:
                if url is None:
                    print(
                        f"-> FATAL: paper ID {anth_id} has no <url>, cannot retrieve the original version",
                        file=sys.stderr,
                    )
                    sys.exit(1)
                current_version_url = infer_url(url.text) + ".pdf"

                # Download original file
                # There are no versioned files the first time around, so create the first one
                # (essentially backing up the original version)
                revised_file_v1_path = os.path.join(
                    output_dir, f"{anth_id}{change_letter}1.pdf")

                retrieve_url(current_version_url, revised_file_v1_path)
                validate_file_type(revised_file_v1_path)

                old_checksum = compute_hash_from_file(revised_file_v1_path)

                # First revision requires making the original version explicit
                make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{anth_id}{change_letter}1",
                        "hash": old_checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{anth_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": date,
                },
                parent=paper,
            )
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML',
                  file=sys.stderr)

    else:
        print(
            f"-> FATAL: paper ID {anth_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    revised_file_versioned_path = os.path.join(
        output_dir, f"{anth_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(pdf_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if change_type == "revision":
        maybe_copy(pdf_path, canonical_path)
Exemplo n.º 20
0
def add_attachment(anthology_id, path, attach_type, overwrite=False):
    """
    Adds a single attachment to the Anthology data files.

    Arguments:
    - The ACL ID of the paper (e.g., P17-1012)
    - The path to the attachment (can be a URL)
    - The attachment type (poster, presentation, note, software)
    - Whether to overwrite the downloaded file.

    Returns the path the attachment was copied to, or None if the target
    file already exists and overwriting was not requested.

    Raises an Exception if the download fails with an SSL error, if the file
    type cannot be detected or is inconsistent, if the file extension is not
    in ALLOWED_TYPES, or if the paper is not found in the XML. A file
    downloaded to a temporary location is removed in all of these cases.
    """

    collection_id, volume_id, paper_id = deconstruct_anthology_id(anthology_id)
    paper_extension = path.replace("?dl=1", "").split(".")[-1]

    output_dir = os.path.join(args.attachment_root, collection_id[0],
                              collection_id)
    attachment_file_name = f"{anthology_id}.{attach_type}.{paper_extension}"
    dest_path = os.path.join(output_dir, attachment_file_name)
    if os.path.exists(dest_path) and not overwrite:
        print(
            f"-> target file {dest_path} already in place, refusing to overwrite",
            file=sys.stderr,
        )
        return None

    is_remote = path.startswith("http")
    if is_remote:
        # mkstemp() hands back an open descriptor; close it right away since
        # the file is reopened by name below
        temp_fd, input_file_path = tempfile.mkstemp()
        os.close(temp_fd)
    else:
        input_file_path = path

    try:
        if is_remote:
            try:
                print(f"-> Downloading file from {path} to {input_file_path}",
                      file=sys.stderr)
                with urllib.request.urlopen(path) as url, open(
                        input_file_path, mode="wb") as input_file_fh:
                    input_file_fh.write(url.read())
            except ssl.SSLError as e:
                raise Exception(f"Could not download {path}") from e

        # The detected MIME type has to end in the detected extension
        detected = filetype.guess(input_file_path)
        if detected is None or not detected.mime.endswith(detected.extension):
            mime_type = 'UNKNOWN' if detected is None else detected.mime
            raise Exception(
                f"{anthology_id} file {path} has MIME type {mime_type}")

        if paper_extension not in ALLOWED_TYPES:
            raise Exception(
                f"-> Unknown file extension {paper_extension} for {path}")

        # Update XML
        xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data",
                                "xml", f"{collection_id}.xml")
        tree = ET.parse(xml_file)

        paper = tree.getroot().find(
            f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
        if paper is None:
            raise Exception(f"Paper {anthology_id} not found in the Anthology")

        # Check if attachment already exists
        for attachment in paper.findall("attachment"):
            if attachment.text == attachment_file_name:
                print(
                    f"-> attachment {attachment_file_name} already exists in the XML",
                    file=sys.stderr,
                )
                break
        else:
            attachment = ET.Element("attachment")
            attachment.attrib["type"] = attach_type.lower()
            attachment.text = attachment_file_name

            paper.append(attachment)
            indent(tree.getroot())
            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f"-> added attachment {attachment_file_name} to the XML",
                  file=sys.stderr)

        # Make sure directory exists
        if not os.path.exists(output_dir):
            #        print(f"-> Creating directory {output_dir}", file=sys.stderr)
            os.makedirs(output_dir)

        # Copy file
        shutil.copy(input_file_path, dest_path)
        os.chmod(dest_path, 0o644)
        print(f"-> copied {input_file_path} to {dest_path} and fixed perms",
              file=sys.stderr)
    finally:
        # Clean up the download, on success as well as on failure
        if is_remote and os.path.exists(input_file_path):
            os.remove(input_file_path)

    return dest_path
Exemplo n.º 21
0
def main(args):
    """
    Adds a revision or an erratum to a paper: registers it in the collection
    XML and copies the PDF into place.

    Reads the following attributes of `args`:
    - anthology_id: the Anthology ID of the paper
    - path: local path or http(s) URL of the new PDF
    - erratum: if true the change is an erratum ("e" files), otherwise a
      revision ("v" files)
    - explanation: text stored in the new XML node
    - do: if false this is a dry run; the XML is not modified and the
      original PDF is not downloaded
    - anthology_dir: root directory of the Anthology files

    For the first revision of a paper, the currently published PDF is
    downloaded to {anthology_id}v1.pdf. The new file is always copied to its
    versioned name; only revisions are additionally copied over the
    canonical {anthology_id}.pdf (an erratum does not replace the paper).

    Exits with status 1 on an SSL error while downloading, or if the paper is
    not found in the XML.
    """

    change_type = 'erratum' if args.erratum else 'revision'
    change_letter = 'e' if args.erratum else 'v'

    print(f'Processing {change_type} to {args.anthology_id}...')

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith('http'):
        # mkstemp() returns an open descriptor; close it, the file is
        # reopened by name below
        temp_fd, input_file_path = tempfile.mkstemp()
        os.close(temp_fd)
        try:
            print(f'-> Downloading file from {args.path}', file=sys.stderr)
            with urllib.request.urlopen(args.path) as url, open(input_file_path, mode='wb') as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            print('An SSL error was encountered in downloading the files.', file=sys.stderr)
            os.remove(input_file_path)
            sys.exit(1)
    else:
        input_file_path = args.path

    collection_id, volume_id, paper_id = deconstruct_anthology_id(args.anthology_id)

    # The new version
    revno = None

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'xml', f'{collection_id}.xml')
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        # The next number follows the last existing node of this type; with
        # no existing nodes, errata start at 1 and revisions at 2
        revisions = paper.findall(change_type)
        revno = 1 if args.erratum else 2
        for revision in revisions:
            revno = int(revision.attrib['id']) + 1

        if args.do:
            revision = ET.Element(change_type)
            revision.attrib['id'] = str(revno)
            revision.attrib['href'] = f'{args.anthology_id}{change_letter}{revno}'
            revision.text = args.explanation

            # Set tails to maintain proper indentation
            # (skipped when the paper has no children or its last child has
            # no tail, where there is nothing to extend)
            if len(paper) > 0 and paper[-1].tail is not None:
                paper[-1].tail += '  '
            revision.tail = '\n    '  # newline and two levels of indent

            paper.append(revision)

            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML', file=sys.stderr)

    else:
        print(f'-> FATAL: paper ID {args.anthology_id} not found in the Anthology', file=sys.stderr)
        sys.exit(1)

    output_dir = os.path.join(args.anthology_dir, 'pdf', collection_id[0], collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f'-> Creating directory {output_dir}', file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f'{args.anthology_id}.pdf')

    if not args.erratum and revno == 2:
        # There are no versioned files the first time around, so create the first one
        # (essentially backing up the original version)
        revised_file_v1_path = os.path.join(output_dir, f'{args.anthology_id}{change_letter}1.pdf')

        current_version = ANTHOLOGY_PDF.format(args.anthology_id)
        if args.do:
            try:
                print(f'-> Downloading file from {current_version} to {revised_file_v1_path}', file=sys.stderr)
                with urllib.request.urlopen(current_version) as url, open(revised_file_v1_path, mode='wb') as fh:
                    fh.write(url.read())
            except ssl.SSLError:
                print(f'-> FATAL: An SSL error was encountered in downloading {current_version}.', file=sys.stderr)
                sys.exit(1)
        else:
            print(f'-> DRY RUN: Downloading file from {current_version} to {revised_file_v1_path}', file=sys.stderr)

    revised_file_versioned_path = os.path.join(output_dir, f'{args.anthology_id}{change_letter}{revno}.pdf')

    # Copy the file to the versioned path
    maybe_copy(input_file_path, revised_file_versioned_path, args.do)

    # Copy it over the canonical path; an erratum must not replace the paper
    if not args.erratum:
        maybe_copy(input_file_path, canonical_path, args.do)

    if args.path.startswith('http'):
        os.remove(input_file_path)
Exemplo n.º 22
0
from anthology.utils import deconstruct_anthology_id

if __name__ == "__main__":
    import argparse

    # Command line: one or more collection IDs to report on
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("collections", nargs="+")
    args = arg_parser.parse_args()

    # The Anthology data directory sits next to the script directory
    data_dir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
    anthology = Anthology(importdir=data_dir)

    # header
    print("name", "id", "title", sep="\t")

    # Emit one tab-separated row for every paper of the requested collections
    # whose first author has exactly one paper listed under that name.
    for id_, paper in anthology.papers.items():
        collection_id, _, _ = deconstruct_anthology_id(id_)
        if collection_id not in args.collections:
            continue

        authors = paper.attrib.get("author", [])
        if len(authors) == 0:
            continue

        # "authors" is a list of ("last name || first name", name-id or None) tuples
        first_author = authors[0][0]
        paper_groups = list(
            anthology.people.name_to_papers[first_author].values())
        # The first two groups registered under the name are merged
        papers_by_author = paper_groups[0] + paper_groups[1]
        if len(papers_by_author) != 1:
            continue

        print(first_author.full, id_, paper.get_title('text'), sep="\t")
Exemplo n.º 23
0
def main(args):
    """
    Adds a revision or an erratum to a paper: registers it in the collection
    XML (together with file hashes) and copies the PDF into place.

    Reads the following attributes of `args`:
    - anthology_id: the Anthology ID of the paper (paper ID "0" addresses the
      volume's frontmatter)
    - path: local path or http(s) URL of the new PDF
    - erratum: if true the change is an erratum ("e" files), otherwise a
      revision ("v" files)
    - explanation: text stored in the new XML node
    - date: date recorded on the new XML node
    - dry_run: if true, the XML is not modified and file copies are only
      reported on stderr
    - anthology_dir: root directory of the Anthology files

    For the first revision of a paper, the currently published PDF is
    downloaded to {anthology_id}v1.pdf and registered as version 1. Only
    revisions replace the canonical {anthology_id}.pdf, so only revisions
    update the hash stored on the paper's <url> tag.

    Exits with status 1 if the paper is not found in the XML, or if the first
    revision is requested for a paper without a <url> tag.
    """
    def maybe_copy(file_from, file_to):
        if not args.dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    change_type = "erratum" if args.erratum else "revision"
    change_letter = "e" if args.erratum else "v"

    print(f"Processing {change_type} to {args.anthology_id}...")

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith("http"):
        # mkstemp() returns an open descriptor; close it, the download
        # helper is only given the file name
        temp_fd, input_file_path = tempfile.mkstemp()
        os.close(temp_fd)
        download_file(args.path, input_file_path)
    else:
        input_file_path = args.path

    validate_file_type(input_file_path)

    collection_id, volume_id, paper_id = deconstruct_anthology_id(
        args.anthology_id)

    # The new version
    revno = None

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Files for old-style IDs are stored under anthology-files/pdf/P/P19/*
    # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/*
    if is_newstyle_id(args.anthology_id):
        venue_name = collection_id.split(".")[1]
        output_dir = os.path.join(args.anthology_dir, "pdf", venue_name)
    else:
        output_dir = os.path.join(args.anthology_dir, "pdf", collection_id[0],
                                  collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{args.anthology_id}.pdf")

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)
    if paper_id == "0":
        paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
    else:
        paper = tree.getroot().find(
            f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        # The next number follows the last existing node of this type; with
        # no existing nodes, errata start at 1 and revisions at 2 (version 1
        # being the original paper).
        revisions = paper.findall(change_type)
        revno = 1 if args.erratum else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not args.dry_run:
            url = paper.find("./url")

            # Update the URL hash on the <url> tag, but only when the
            # canonical PDF is actually going to be replaced
            if not args.erratum and url is not None:
                url.attrib["hash"] = checksum

            if not args.erratum and revno == 2:
                if url is None:
                    print(
                        f"-> FATAL: paper ID {args.anthology_id} has no <url>, cannot retrieve the original version",
                        file=sys.stderr,
                    )
                    sys.exit(1)
                current_version_url = infer_url(url.text) + ".pdf"

                # Download original file
                # There are no versioned files the first time around, so create the first one
                # (essentially backing up the original version)
                revised_file_v1_path = os.path.join(
                    output_dir, f"{args.anthology_id}{change_letter}1.pdf")

                download_file(current_version_url, revised_file_v1_path)
                validate_file_type(revised_file_v1_path)

                with open(revised_file_v1_path, "rb") as f:
                    old_checksum = compute_hash(f.read())

                # First revision requires making the original version explicit
                make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{args.anthology_id}{change_letter}1",
                        "hash": old_checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                args.explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{args.anthology_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": args.date,
                },
                parent=paper,
            )
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML',
                  file=sys.stderr)

    else:
        print(
            f"-> FATAL: paper ID {args.anthology_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    revised_file_versioned_path = os.path.join(
        output_dir, f"{args.anthology_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(input_file_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if not args.erratum:
        maybe_copy(input_file_path, canonical_path)

    if args.path.startswith("http"):
        os.remove(input_file_path)
Exemplo n.º 24
0
def add_attachment(anthology_id, path, attach_type, overwrite=False):
    """
    Adds a single attachment to the Anthology data files.

    Arguments:
    - The ACL ID of the paper (e.g., P17-1012)
    - The path to the attachment (can be a URL)
    - The attachment type (poster, presentation, note, software)
    - Whether to overwrite the downloaded file.

    Returns the path the attachment was copied to, or None if the target
    file already exists and overwriting was not requested.

    Raises an Exception if the download fails with an SSL error; any other
    download error propagates unchanged. A file downloaded to a temporary
    location is removed before returning, whatever the outcome.
    """

    collection_id, volume_id, paper_id = deconstruct_anthology_id(anthology_id)

    is_remote = path.startswith("http")
    if is_remote:
        # mkstemp() hands back an open descriptor; close it right away since
        # the file is reopened by name below
        temp_fd, input_file_path = tempfile.mkstemp()
        os.close(temp_fd)
    else:
        input_file_path = path

    try:
        if is_remote:
            try:
                print(f"-> Downloading file from {path} to {input_file_path}",
                      file=sys.stderr)
                request = urllib.request.Request(
                    path, headers={'User-Agent': 'Mozilla/5.0'})
                with urllib.request.urlopen(request) as url, open(
                        input_file_path, mode="wb") as input_file_fh:
                    input_file_fh.write(url.read())
            except ssl.SSLError as e:
                raise Exception(f"Could not download {path}") from e

        file_extension = path.replace("?dl=1", "").split(".")[-1]
        # Many links from file sharing services are not informative and don't have
        # extensions, so we could try to guess.
        if file_extension not in ALLOWED_TYPES:
            detected = filetype.guess(input_file_path)
            if detected is not None:
                file_extension = detected.mime.split("/")[-1]
            # Warn whenever the extension is still not an allowed one, also
            # when the file type could not be detected at all
            if file_extension not in ALLOWED_TYPES:
                print(
                    f"Could not determine file extension for {anthology_id} at {path}",
                    file=sys.stderr,
                )

        with open(input_file_path, "rb") as f:
            checksum = compute_hash(f.read())

        # Update XML
        xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data",
                                "xml", f"{collection_id}.xml")
        tree = ET.parse(xml_file)

        attachment_file_name = f"{anthology_id}.{attach_type}.{file_extension}"

        paper = tree.getroot().find(
            f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
        if paper is not None:
            # Check if attachment already exists
            for attachment in paper.findall("attachment"):
                if attachment.text == attachment_file_name:
                    print(
                        f"-> attachment {attachment_file_name} already exists in the XML",
                        file=sys.stderr,
                    )
                    break
            else:
                attachment = ET.Element("attachment")
                attachment.attrib["type"] = attach_type.lower()
                attachment.attrib["hash"] = checksum
                attachment.text = attachment_file_name

                paper.append(attachment)
                indent(tree.getroot())
                tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
                print(f"-> added attachment {attachment_file_name} to the XML",
                      file=sys.stderr)

        else:
            # A missing paper is only reported; the file is still copied below
            print(f"Paper {anthology_id} not found in the Anthology",
                  file=sys.stderr)

        # Make sure directory exists
        output_dir = os.path.join(args.attachment_root, collection_id[0],
                                  collection_id)
        if not os.path.exists(output_dir):
            #        print(f"-> Creating directory {output_dir}", file=sys.stderr)
            os.makedirs(output_dir)

        # Copy file
        dest_path = os.path.join(output_dir, attachment_file_name)
        if os.path.exists(dest_path) and not overwrite:
            print(
                f"-> target file {dest_path} already in place, refusing to overwrite",
                file=sys.stderr,
            )
            return None

        shutil.copy(input_file_path, dest_path)
        os.chmod(dest_path, 0o644)
        print(f"-> copied {input_file_path} to {dest_path} and fixed perms",
              file=sys.stderr)
    finally:
        # Clean up the download, also on the early return and on errors
        if is_remote and os.path.exists(input_file_path):
            os.remove(input_file_path)

    return dest_path