def upload_file(filepath: str):
    """
    Uploads regular PDFs or attachments to their correct place on the aclweb server.
    """
    relative_dest_path = ''
    os.chmod(filepath, 0o644)
    filename = os.path.basename(filepath)
    fileparts = filename.split('.')
    if len(fileparts) == 2:
        # e.g., P19-1001.pdf
        collection_id, volume_id, _ = deconstruct_anthology_id(fileparts[0])
        collection = collection_id[0]
        relative_dest_path = f'pdf/{collection}/{collection_id}/{filename}'
    elif len(fileparts) == 3:
        # e.g., P19-1001.Attachment.pdf
        collection_id, volume_id, _ = deconstruct_anthology_id(fileparts[0])
        collection = collection_id[0]
        relative_dest_path = f'attachments/{collection}/{collection_id}/{filename}'

    command = f'scp -q {filepath} aclweb:{ACLWEB_FILE_ROOT}/{relative_dest_path}'

    attempts = 1
    retcode = 1
    while attempts <= 3 and retcode != 0:
        # This fails sometimes for no reason, so try a couple of times
        retcode = subprocess.call(command, shell=True)
        if attempts > 1:
            print(f'-> Failed for some reason, attempt #{attempts}', file=sys.stderr)
        print(f'{command} -> {retcode}', file=sys.stderr)
        attempts += 1
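# Hedged usage sketch (not part of the original script): walk a hypothetical local
# directory of built PDFs/attachments and push each one with upload_file(). The
# directory name "build/anthology-files" is an assumption for illustration only.
def upload_directory(root_dir: str = "build/anthology-files"):
    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename.endswith(".pdf"):
                upload_file(os.path.join(dirpath, filename))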
def process_volume(anthology_volume):
    collection_id, volume_id, _ = deconstruct_anthology_id(anthology_volume)

    print(f'Attempting to add DOIs for {anthology_volume}', file=sys.stderr)

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'xml',
                            f'{collection_id}.xml')
    tree = ET.parse(xml_file)

    formatter = MarkupFormatter()

    num_added = 0
    volume = tree.getroot().find(f"./volume[@id='{volume_id}']")
    if volume is not None:
        volume_booktitle = volume.find("./meta/booktitle")
        volume_title = formatter.as_text(volume_booktitle)
        print(f'-> found existing volume "{volume_title}"', file=sys.stderr)

        # Iterate through all papers (including the frontmatter)
        for paper in chain(volume.findall('frontmatter'), volume.findall('paper')):
            added = add_doi(paper, collection_id, volume_id, force=args.force)
            if added:
                num_added += 1
                sleep(1)

        indent(tree.getroot())
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
        print(f'-> added {num_added} DOIs to the XML for collection {collection_id}',
              file=sys.stderr)
    else:
        print(f'-> FATAL: volume {anthology_volume} not found in the Anthology',
              file=sys.stderr)
        sys.exit(1)
def process_volume(anthology_volume):
    collection_id, volume_id, _ = deconstruct_anthology_id(anthology_volume)

    if is_newstyle_id(anthology_volume):
        venue_path = collection_id.split(".")[1]
    else:
        venue_path = os.path.join(collection_id[0], collection_id)

    print(f"Downloading PDFs for {anthology_volume}", file=sys.stderr)

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)

    for paper in tree.getroot().findall(".//paper"):
        anthid = paper.find("./url").text

        # Try to get the URL from the Anthology
        if not test_url_code(infer_url(anthid)):
            doi = paper.find("./doi").text
            doi_pdf = f"https://www.mitpressjournals.org/doi/pdf/{doi}"

            local_path = os.path.join(args.anthology_files_dir, venue_path, f"{anthid}.pdf")
            if not os.path.exists(os.path.dirname(local_path)):
                os.makedirs(os.path.dirname(local_path))
            retrieve_url(doi_pdf, local_path)
            print(f"Saved {doi_pdf} to {local_path}")
            sleep(1)
def main(args):
    print(f"Adding {args.award} to {args.anthology_id}...")

    collection_id, volume_id, paper_id = deconstruct_anthology_id(args.anthology_id)

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")

    if paper is None:
        print(f"Error: Can't find paper {args.anthology_id}, quitting")
        sys.exit(1)

    existing_award = paper.find("./award")
    if existing_award is not None and existing_award.text.lower() == args.award:
        print(f"Error: Award {args.award} already exists for {args.anthology_id}, quitting")
        sys.exit(1)

    make_simple_element("award", args.award, parent=paper)
    indent(tree.getroot())
    tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
def volume_sorter(volume_tuple):
    """
    Extracts the year so that we can sort by year and then by volume ID.
    """
    volume_id = volume_tuple[0]
    collection_id, _, _ = deconstruct_anthology_id(volume_id)
    year = infer_year(collection_id)
    return year, volume_id
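# Hedged usage sketch (assumption: `volume_counts` maps full volume IDs such as
# "P19-1" or "W19-31" to some payload). volume_sorter() keys the sort by inferred
# year first, then by volume ID.
volume_counts = {"W19-31": 12, "P19-1": 660, "D19-1": 540}
for volume_id, count in sorted(volume_counts.items(), key=volume_sorter):
    print(volume_id, count)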
def remove_volume(self, full_volume_id):
    """
    Volumes with future ingestion dates are not built and may need to be removed
    from a SIG's listing. `full_volume_id` looks like `P19-1` or `W19-31`.
    """
    collection_id, _, _ = deconstruct_anthology_id(full_volume_id)
    year = int(infer_year(collection_id))
    if year in self.events_by_year:
        self.events_by_year[year] = [
            event for event in self.events_by_year[year] if event[0] != full_volume_id
        ]
def add_video_tag(anth_paper, xml_parse):
    coll_id, vol_id, paper_id = deconstruct_anthology_id(anth_paper.anthology_id)
    paper = xml_parse.find(f'./volume[@id="{vol_id}"]/paper[@id="{paper_id}"]')
    if anth_paper.presentation_id.startswith("http"):
        video_url = anth_paper.presentation_id
    else:
        video_url = "https://slideslive.com/{}".format(anth_paper.presentation_id)
    make_simple_element("video", attrib={"tag": "video", "href": video_url}, parent=paper)
def main(args):
    for lineno, line in enumerate(sys.stdin, 1):
        # attachments/D/D15/D15-1272.Attachment.pdf
        tokens = line.rstrip().split("/")
        attachment_file_name = tokens[-1]
        try:
            anth_id, kind, *rest = attachment_file_name.split(".")
        except ValueError:
            print(f"Couldn't parse file {attachment_file_name} into 3 pieces")
            continue

        try:
            collection_id, volume_id, paper_id = deconstruct_anthology_id(anth_id)
        except Exception:
            print(f"[{lineno}] BAD LINE {line.rstrip()}")
            continue

        # Update XML
        xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                                f"{collection_id}.xml")
        tree = ET.parse(xml_file)

        if int(paper_id) == 0:
            paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
        else:
            paper = tree.getroot().find(
                f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")

        if paper is not None:
            # Check if attachment already exists
            for attachment in paper.findall("attachment"):
                if attachment.text == attachment_file_name:
                    # print(f'-> attachment {attachment_file_name} already exists in the XML', file=sys.stderr)
                    break
            else:
                attachment = ET.Element("attachment")
                attachment.attrib["type"] = kind.lower()
                attachment.text = attachment_file_name
                paper.append(attachment)

                indent(tree.getroot())
                tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
                print(f"-> [{lineno}] added attachment {attachment_file_name} to the XML",
                      file=sys.stderr)
        else:
            print(f"-> FATAL: [{lineno}] paper ({anth_id}) not found in the Anthology",
                  file=sys.stderr)
            sys.exit(1)
def __init__(self, acronym, letter, anth_id):
    self.parent_venue = acronym.lower()
    self.anth_id = anth_id
    collection_id, self.volume_id, _ = deconstruct_anthology_id(anth_id)
    if is_newstyle_id(collection_id):
        self.venue = collection_id.split(".")[1]
        self.is_parent_venue = self.venue == self.parent_venue
    else:
        self.venue = collection_id[0]
        self.is_parent_venue = self.venue == letter
def get_dest_path(filepath: str):
    """
    Returns the destination path on the remote server for the file.
    """
    dest_path = ""
    filename = os.path.basename(filepath)
    fileparts = filename.split(".")
    if len(fileparts) == 2:
        # e.g., P19-1001.pdf
        collection_id, volume_id, _ = deconstruct_anthology_id(fileparts[0])
        collection = collection_id[0]
        dest_path = f"pdf/{collection}/{collection_id}"
    elif len(fileparts) == 3:
        # e.g., P19-1001.Attachment.pdf
        collection_id, volume_id, _ = deconstruct_anthology_id(fileparts[0])
        collection = collection_id[0]
        dest_path = f"attachments/{collection}/{collection_id}"
    else:
        raise Exception(f"Can't determine target destination from {filepath}")

    return f"{ANTHOLOGY_HOST}:{ANTHOLOGY_FILE_ROOT}/{dest_path}"
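# Hedged usage sketch: compose get_dest_path() into an scp command, mirroring how the
# upload helpers in these scripts push files to the server. The local filename below is
# a hypothetical example, not a real file.
local_file = "P19-1001.Attachment.pdf"
command = f"scp -q {local_file} {get_dest_path(local_file)}"
subprocess.call(command, shell=True)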
def main(args):
    """
    Downloads an Anthology paper and adds a RETRACTED watermark, then updates the XML
    with an appropriate <revision> and <retracted> tag.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        new_pdf = add_watermark(args.anthology_id, workdir=tempdir)

        add_revision(
            args.anthology_id,
            new_pdf,
            explanation="Retracted.",
            change_type="revision",
            dry_run=False,
        )

        xml_file = get_xml_file(args.anthology_id)
        collection_id, volume_id, paper_id = deconstruct_anthology_id(args.anthology_id)
        tree = ET.parse(xml_file)
        if paper_id == "0":
            paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
        else:
            paper = tree.getroot().find(
                f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")

        if paper is None:
            print(f"Couldn't find paper {args.anthology_id}!", file=sys.stderr)
            sys.exit(2)

        print("Modifying the XML", file=sys.stderr)
        now = datetime.now()
        date = f"{now.year}-{now.month:02d}-{now.day:02d}"
        retracted_node = make_simple_element(
            "retracted", args.explanation, attrib={"date": date}, parent=paper)
        indent(tree.getroot())
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
def split_anth_id(anth_id):
    coll_id, _, _ = deconstruct_anthology_id(anth_id)
    return coll_id
def bib2xml(bibfilename, anthology_id):
    """
    Moved here from ACLPUB's anthology_xml.py script.
    """
    fields = [
        'title',
        'author',
        'editor',
        'booktitle',
        'month',
        'year',
        'address',
        'publisher',
        'pages',
        'abstract',
        'url',
        'doi',
        'language',
    ]

    collection_id, volume_name, paper_no = deconstruct_anthology_id(anthology_id)
    if paper_no == '':
        return  # skip the master bib file; we only process the individual files

    bibdata = read_bibtex(bibfilename)
    if len(bibdata.entries) != 1:
        log(f"more than one entry in {bibfilename}")
    bibkey, bibentry = bibdata.entries.items()[0]
    if len(bibentry.fields) == 0:
        log(f"parsing bib of paper {paper_no} failed")
        sys.exit(1)

    paper = make_simple_element("paper", attrib={"id": paper_no})
    for field in list(bibentry.fields) + list(bibentry.persons):
        if field not in fields:
            log(f"unknown field {field}")

    for field in fields:
        if field in ['author', 'editor']:
            if field in bibentry.persons:
                for person in bibentry.persons[field]:
                    first_text = ' '.join(person.bibtex_first_names)
                    last_text = ' '.join(person.prelast_names + person.last_names)
                    if person.lineage_names:
                        last_text += ', ' + ' '.join(person.lineage_names)

                    # Don't distinguish between authors that have only a first name
                    # vs. authors that have only a last name; always make it a last name.
                    if last_text.strip() in ['', '-']:  # Some START users have '-' for null
                        last_text = first_text
                        first_text = ''

                    name_node = make_simple_element(field, parent=paper)
                    make_simple_element("first", first_text, parent=name_node)
                    make_simple_element("last", last_text, parent=name_node)
        else:
            if field == 'url':
                value = f"{anthology_id}"
            elif field in bibentry.fields:
                value = bibentry.fields[field]
            elif field == 'bibtype':
                value = bibentry.type
            elif field == 'bibkey':
                value = bibkey
            else:
                continue

            make_simple_element(field, text=value, parent=paper)

    return paper
def add_volume(self, full_volume_id):
    collection_id, _, _ = deconstruct_anthology_id(full_volume_id)
    year = int(infer_year(collection_id))
    if year not in self.events_by_year:
        self.events_by_year[year] = []
    self.events_by_year[year].append(full_volume_id)
def main(volumes):
    formatter = MarkupFormatter()

    ## Assemble container
    doi_batch = make_simple_element(
        "doi_batch",
        attrib={
            "xmlns": "http://www.crossref.org/schema/4.4.1",
            "{http://www.w3.org/2001/XMLSchema-instance}schemaLocation":
                "http://www.crossref.org/schema/4.4.1 http://www.crossref.org/schema/deposit/crossref4.4.1.xsd",
            "version": "4.4.1",
        },
        namespaces={"xsi": "http://www.w3.org/2001/XMLSchema-instance"},
    )
    new_volume = etree.ElementTree(doi_batch)

    ## Assemble head
    head = make_simple_element("head", parent=new_volume.getroot())
    dbi = make_simple_element("doi_batch_id", text=str(int(time.time())), parent=head)
    timestamp = make_simple_element("timestamp", text=str(int(time.time())), parent=head)
    depositor = make_simple_element("depositor", parent=head)
    depositor_name = make_simple_element("depositor_name", text=DEPOSITOR_NAME, parent=depositor)
    email_address = make_simple_element("email_address", text=EMAIL_ADDRESS, parent=depositor)
    registrant = make_simple_element("registrant", text=REGISTRANT, parent=head)

    ## Assemble body
    body = make_simple_element("body", parent=new_volume.getroot())
    year = ""
    start_month = ""
    end_month = ""

    for full_volume_id in sorted(volumes):
        collection_id, volume_id, _ = deconstruct_anthology_id(full_volume_id)

        collection_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                                       f"{collection_id}.xml")
        tree = etree.parse(collection_file)

        v = tree.getroot().find(f"./volume[@id='{volume_id}']")
        if v is None:
            print(f"* Can't find volume {full_volume_id}", file=sys.stderr)
            continue

        ## Assemble frontmatter
        c = make_simple_element("conference", parent=body)
        contribs = make_simple_element("contributors", parent=c)

        editor_index = 0

        meta = v.find("./meta")
        for tag in meta:
            if tag.tag == "year":
                year = tag.text
            elif tag.tag == "month":
                month = tag.text
                try:
                    start_month = MONTH_HASH[re.split("[-–]", month)[0]]
                    end_month = MONTH_HASH[re.split("[-–]", month)[1]]
                except IndexError as e:  # only one month
                    start_month = MONTH_HASH[month]
                    end_month = MONTH_HASH[month]
                except Exception as e:
                    print(f"FATAL: can't parse month {month} in {full_volume_id}",
                          file=sys.stderr)
                    sys.exit(1)
            elif tag.tag == "url":
                url = tag.text
            elif tag.tag == "booktitle":
                booktitle = formatter.as_text(tag)
            elif tag.tag == "address":
                address = tag.text
            elif tag.tag == "publisher":
                publisher = tag.text
            elif tag.tag == "editor":
                pn = make_simple_element(
                    "person_name",
                    parent=contribs,
                    attrib={
                        "contributor_role": "chair",
                        "sequence": "first" if editor_index == 0 else "additional",
                    },
                )
                editor_index += 1
                for name_part in tag:
                    if name_part.tag == "first":
                        gn = make_simple_element("given_name", parent=pn, text=name_part.text)
                    elif name_part.tag == "last":
                        sn = make_simple_element("surname", text=name_part.text, parent=pn)

        # Assemble Event Metadata
        em = make_simple_element("event_metadata", parent=c)
        cn = make_simple_element("conference_name", parent=em, text=booktitle)
        cl = make_simple_element("conference_location", parent=em, text=address)
        cd = make_simple_element(
            "conference_date",
            parent=em,
            attrib={
                "start_year": year,
                "end_year": year,
                "start_month": start_month,
                "end_month": end_month,
            },
        )

        # Assemble Proceedings Metadata
        pm = make_simple_element("proceedings_metadata", parent=c, attrib={"language": "en"})
        pt = make_simple_element("proceedings_title", parent=pm, text=booktitle)
        p = make_simple_element("publisher", parent=pm)
        pn = make_simple_element("publisher_name", parent=p, text=publisher)
        pp = make_simple_element("publisher_place", parent=p, text=PUBLISHER_PLACE)
        pd = make_simple_element("publication_date", parent=pm)
        y = make_simple_element("year", parent=pd, text=year)
        noisbn = make_simple_element("noisbn", parent=pm, attrib={"reason": "simple_series"})

        # DOI assignation data
        dd = make_simple_element("doi_data", parent=pm)
        doi = make_simple_element("doi", parent=dd, text=DOI_PREFIX + url)
        resource = make_simple_element("resource", parent=dd, text=ANTHOLOGY_URL.format(url))

        for paper in v.findall("./paper"):
            ## Individual Paper Data
            # TODO: this is not future-proof, should use anthology.util library functions
            aa_id = ""
            if len(url) == 6:
                aa_id = "{:02d}".format(int(paper.attrib["id"]))
            elif len(url) == 5:
                aa_id = "{:03d}".format(int(paper.attrib["id"]))

            cp = make_simple_element("conference_paper", parent=c)

            # contributors
            contribs = make_simple_element("contributors", parent=cp)
            author_index = 0
            for author in paper.findall("./author"):
                pn = make_simple_element(
                    "person_name",
                    parent=contribs,
                    attrib={
                        "contributor_role": "author",
                        "sequence": "first" if author_index == 0 else "additional",
                    },
                )
                author_index += 1
                for name_part in author:
                    if name_part.tag == "first":
                        gn = make_simple_element("given_name", parent=pn, text=name_part.text)
                    elif name_part.tag == "last":
                        sn = make_simple_element("surname", text=name_part.text, parent=pn)

            for title in paper.iter(tag="title"):
                o_titles = make_simple_element("titles", parent=cp)
                o_title = make_simple_element("title", parent=o_titles,
                                              text=formatter.as_text(title))

            pd = make_simple_element("publication_date", parent=cp)
            o_year = make_simple_element("year", parent=pd)
            o_year.text = year

            for pages in paper.iter(tag="pages"):
                o_pages = make_simple_element("pages", parent=cp)
                fp = make_simple_element("first_page", parent=o_pages)
                lp = make_simple_element("last_page", parent=o_pages)
                try:
                    fp.text = re.split("[-–]", pages.text)[0]
                    lp.text = re.split("[-–]", pages.text)[1]
                except IndexError as e:  # only one page
                    fp.text = pages.text
                    lp.text = pages.text

            # DOI assignation data
            dd = make_simple_element("doi_data", parent=cp)
            doi = make_simple_element("doi", parent=dd, text=DOI_PREFIX + url + aa_id)
            resource = make_simple_element("resource", parent=dd,
                                           text=ANTHOLOGY_URL.format(url + aa_id))

    print(
        etree.tostring(
            new_volume,
            pretty_print=True,
            encoding="UTF-8",
            xml_declaration=True,
            with_tail=True,
        ).decode("utf-8"))
def main(args):
    def maybe_copy(file_from, file_to):
        if not args.dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to), file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print("-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)

    change_type = "erratum" if args.erratum else "revision"
    change_letter = "e" if args.erratum else "v"

    print(f"Processing {change_type} to {args.anthology_id}...")

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        try:
            print(f"-> Downloading file from {args.path}", file=sys.stderr)
            with urllib.request.urlopen(args.path) as url, open(input_file_path,
                                                                mode="wb") as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            print("An SSL error was encountered in downloading the files.", file=sys.stderr)
            sys.exit(1)
    else:
        input_file_path = args.path

    detected = filetype.guess(input_file_path)
    if detected is None or not detected.mime.endswith(detected.extension):
        mime_type = 'UNKNOWN' if detected is None else detected.mime
        print(f"FATAL: {args.anthology_id} file {args.path} has MIME type {mime_type}",
              file=sys.stderr)
        sys.exit(1)

    collection_id, volume_id, paper_id = deconstruct_anthology_id(args.anthology_id)
    paper_extension = args.path.split(".")[-1]

    # The new version
    revno = None

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if args.erratum else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not args.dry_run:
            if not args.erratum and revno == 2:
                # First revision requires making the original version explicit
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{args.anthology_id}{change_letter}1",
                        "hash": checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                args.explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{args.anthology_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": args.date,
                },
                parent=paper,
            )
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML', file=sys.stderr)
    else:
        print(f"-> FATAL: paper ID {args.anthology_id} not found in the Anthology",
              file=sys.stderr)
        sys.exit(1)

    output_dir = os.path.join(args.anthology_dir, "pdf", collection_id[0], collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{args.anthology_id}.pdf")

    if not args.erratum and revno == 2:
        # There are no versioned files the first time around, so create the first one
        # (essentially backing up the original version)
        revised_file_v1_path = os.path.join(output_dir,
                                            f"{args.anthology_id}{change_letter}1.pdf")

        current_version = ANTHOLOGY_PDF.format(args.anthology_id)
        if not args.dry_run:
            try:
                print(f"-> Downloading file from {args.path} to {revised_file_v1_path}",
                      file=sys.stderr)
                with urllib.request.urlopen(current_version) as url, open(
                        revised_file_v1_path, mode="wb") as fh:
                    fh.write(url.read())
            except ssl.SSLError:
                print(f"-> FATAL: An SSL error was encountered in downloading {args.path}.",
                      file=sys.stderr)
                sys.exit(1)
        else:
            print(f"-> DRY RUN: Downloading file from {args.path} to {revised_file_v1_path}",
                  file=sys.stderr)

    revised_file_versioned_path = os.path.join(output_dir,
                                               f"{args.anthology_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(input_file_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if not args.erratum:
        maybe_copy(input_file_path, canonical_path)

    if args.path.startswith("http"):
        os.remove(input_file_path)
def main(args):
    print(f'Processing attachment for {args.anthology_id}', file=sys.stderr)

    if args.path.startswith('http'):
        _, input_file_path = tempfile.mkstemp()
        try:
            print('-> Downloading file from {}'.format(args.path), file=sys.stderr)
            with urllib.request.urlopen(args.path) as url, open(input_file_path,
                                                                mode='wb') as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            print('-> FATAL: An SSL error was encountered in downloading the files.',
                  file=sys.stderr)
            sys.exit(1)
    else:
        input_file_path = args.path

    collection_id, volume_id, paper_id = deconstruct_anthology_id(args.anthology_id)
    paper_extension = args.path.split('.')[-1]

    if paper_extension not in ALLOWED_TYPES:
        print(f'-> FATAL: {args.anthology_id} unknown file extension {paper_extension}',
              file=sys.stderr)
        sys.exit(1)

    attachment_file_name = f'{args.anthology_id}.{args.type}.{paper_extension}'

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'xml',
                            f'{collection_id}.xml')
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        # Check if attachment already exists
        for attachment in paper.findall('attachment'):
            if attachment.text == attachment_file_name:
                print(f'-> attachment {attachment_file_name} already exists in the XML',
                      file=sys.stderr)
                break
        else:
            attachment = ET.Element('attachment')
            attachment.attrib['type'] = args.type.lower()
            attachment.text = attachment_file_name
            paper.append(attachment)
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> added attachment {attachment_file_name} to the XML', file=sys.stderr)
    else:
        print(f'-> FATAL: paper (volume={volume_id}, paper={paper_id}) not found in the Anthology',
              file=sys.stderr)
        sys.exit(1)

    # Make sure directory exists
    output_dir = os.path.join(args.attachment_root, collection_id[0], collection_id)
    if not os.path.exists(output_dir):
        print(f'-> Creating directory {output_dir}', file=sys.stderr)
        os.makedirs(output_dir)

    # Copy file
    dest_path = os.path.join(output_dir, attachment_file_name)
    if os.path.exists(dest_path):
        print(f'-> target file {dest_path} already in place, refusing to overwrite',
              file=sys.stderr)
    else:
        shutil.copy(input_file_path, dest_path)
        os.chmod(dest_path, 0o644)
        print(f'-> copied {input_file_path} to {dest_path} and fixed perms', file=sys.stderr)

    # Clean up
    if args.path.startswith('http'):
        os.remove(input_file_path)
def main(volumes):
    formatter = MarkupFormatter()

    ## Assemble container
    doi_batch = make_simple_element(
        'doi_batch',
        attrib={
            'xmlns': 'http://www.crossref.org/schema/4.4.1',
            '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation':
                'http://www.crossref.org/schema/4.4.1 http://www.crossref.org/schema/deposit/crossref4.4.1.xsd',
            'version': '4.4.1'
        },
        namespaces={'xsi': 'http://www.w3.org/2001/XMLSchema-instance'})
    new_volume = etree.ElementTree(doi_batch)

    ## Assemble head
    head = make_simple_element('head', parent=new_volume.getroot())
    dbi = make_simple_element('doi_batch_id', text=str(int(time.time())), parent=head)
    timestamp = make_simple_element('timestamp', text=str(int(time.time())), parent=head)
    depositor = make_simple_element('depositor', parent=head)
    depositor_name = make_simple_element('depositor_name', text=DEPOSITOR_NAME, parent=depositor)
    email_address = make_simple_element('email_address', text=EMAIL_ADDRESS, parent=depositor)
    registrant = make_simple_element('registrant', text=REGISTRANT, parent=head)

    ## Assemble body
    body = make_simple_element('body', parent=new_volume.getroot())
    year = ""
    start_month = ""
    end_month = ""

    for full_volume_id in sorted(volumes):
        collection_id, volume_id, _ = deconstruct_anthology_id(full_volume_id)

        collection_file = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'xml',
                                       f'{collection_id}.xml')
        tree = etree.parse(collection_file)

        v = tree.getroot().find(f"./volume[@id='{volume_id}']")
        if v is None:
            print(f"* Can't find volume {full_volume_id}", file=sys.stderr)
            continue

        ## Assemble frontmatter
        c = make_simple_element('conference', parent=body)
        contribs = make_simple_element('contributors', parent=c)

        editor_index = 0

        meta = v.find('./meta')
        for tag in meta:
            if tag.tag == 'year':
                year = tag.text
            elif tag.tag == 'month':
                month = tag.text
                try:
                    start_month = MONTH_HASH[re.split('[-–]', month)[0]]
                    end_month = MONTH_HASH[re.split('[-–]', month)[1]]
                except IndexError as e:  # only one month
                    start_month = MONTH_HASH[month]
                    end_month = MONTH_HASH[month]
            elif tag.tag == 'url':
                url = tag.text
            elif tag.tag == 'booktitle':
                booktitle = tag.text
            elif tag.tag == 'address':
                address = tag.text
            elif tag.tag == 'publisher':
                publisher = tag.text
            elif tag.tag == 'editor':
                pn = make_simple_element(
                    'person_name',
                    parent=contribs,
                    attrib={
                        'contributor_role': 'chair',
                        'sequence': 'first' if editor_index == 0 else 'additional'
                    })
                editor_index += 1
                for name_part in tag:
                    if name_part.tag == 'first':
                        gn = make_simple_element('given_name', parent=pn, text=name_part.text)
                    elif name_part.tag == 'last':
                        sn = make_simple_element('surname', text=name_part.text, parent=pn)

        # Assemble Event Metadata
        em = make_simple_element('event_metadata', parent=c)
        cn = make_simple_element('conference_name', parent=em, text=booktitle)
        cl = make_simple_element('conference_location', parent=em, text=address)
        cd = make_simple_element('conference_date',
                                 parent=em,
                                 attrib={
                                     'start_year': year,
                                     'end_year': year,
                                     'start_month': start_month,
                                     'end_month': end_month
                                 })

        # Assemble Proceedings Metadata
        pm = make_simple_element('proceedings_metadata', parent=c, attrib={'language': 'en'})
        pt = make_simple_element('proceedings_title', parent=pm, text=booktitle)
        p = make_simple_element('publisher', parent=pm)
        pn = make_simple_element('publisher_name', parent=p, text=publisher)
        pp = make_simple_element('publisher_place', parent=p, text=PUBLISHER_PLACE)
        pd = make_simple_element('publication_date', parent=pm)
        y = make_simple_element('year', parent=pd, text=year)
        noisbn = make_simple_element('noisbn', parent=pm, attrib={'reason': 'simple_series'})

        # DOI assignation data
        dd = make_simple_element('doi_data', parent=pm)
        doi = make_simple_element('doi', parent=dd, text=DOI_PREFIX + url)
        resource = make_simple_element('resource', parent=dd, text=ANTHOLOGY_URL.format(url))

        for paper in v.findall('./paper'):
            ## Individual Paper Data
            # TODO: this is not future-proof, should use anthology.util library functions
            aa_id = ""
            if len(url) == 6:
                aa_id = '{:02d}'.format(int(paper.attrib['id']))
            elif len(url) == 5:
                aa_id = '{:03d}'.format(int(paper.attrib['id']))

            cp = make_simple_element('conference_paper', parent=c)

            # contributors
            contribs = make_simple_element('contributors', parent=cp)
            author_index = 0
            for author in paper.findall('./author'):
                pn = make_simple_element(
                    'person_name',
                    parent=contribs,
                    attrib={
                        'contributor_role': 'author',
                        'sequence': 'first' if author_index == 0 else 'additional'
                    })
                author_index += 1
                for name_part in author:
                    if name_part.tag == 'first':
                        gn = make_simple_element('given_name', parent=pn, text=name_part.text)
                    elif name_part.tag == 'last':
                        sn = make_simple_element('surname', text=name_part.text, parent=pn)

            for title in paper.iter(tag='title'):
                o_titles = make_simple_element('titles', parent=cp)
                o_title = make_simple_element('title', parent=o_titles,
                                              text=formatter.as_text(title))

            pd = make_simple_element('publication_date', parent=cp)
            o_year = make_simple_element('year', parent=pd)
            o_year.text = year

            for pages in paper.iter(tag='pages'):
                o_pages = make_simple_element('pages', parent=cp)
                fp = make_simple_element('first_page', parent=o_pages)
                lp = make_simple_element('last_page', parent=o_pages)
                try:
                    fp.text = re.split('[-–]', pages.text)[0]
                    lp.text = re.split('[-–]', pages.text)[1]
                except IndexError as e:  # only one page
                    fp.text = pages.text
                    lp.text = pages.text

            # DOI assignation data
            dd = make_simple_element('doi_data', parent=cp)
            doi = make_simple_element('doi', parent=dd, text=DOI_PREFIX + url + aa_id)
            resource = make_simple_element('resource', parent=dd,
                                           text=ANTHOLOGY_URL.format(url + aa_id))

    print(
        etree.tostring(new_volume,
                       pretty_print=True,
                       encoding='UTF-8',
                       xml_declaration=True,
                       with_tail=True).decode('utf-8'))
def add_revision(anth_id, pdf_path, explanation, change_type="revision", dry_run=True,
                 date=None):
    """
    Takes an Anthology ID. It then adds a revision to the Anthology XML, updating and
    writing the XML file, and copies the PDFs into place. For PDFs, the revised PDF is
    saved to {anth_id}.pdf and {anth_id}v{version}.pdf. For the first revision, we first
    copy {anth_id}.pdf to {anth_id}v1.pdf.
    """
    if date is None:
        now = datetime.now()
        date = f"{now.year}-{now.month:02d}-{now.day:02d}"

    def maybe_copy(file_from, file_to):
        if not dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to), file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print("-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)

    # The new version
    revno = None

    change_letter = "e" if change_type == "erratum" else "v"

    checksum = compute_hash_from_file(pdf_path)

    # Files for old-style IDs are stored under anthology-files/pdf/P/P19/*
    # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/*
    output_dir = get_pdf_dir(anth_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{anth_id}.pdf")

    # Update XML
    xml_file = get_xml_file(anth_id)
    collection_id, volume_id, paper_id = deconstruct_anthology_id(anth_id)
    tree = ET.parse(xml_file)
    if paper_id == "0":
        paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
    else:
        paper = tree.getroot().find(f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")

    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if change_type == "erratum" else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not dry_run:
            # Update the URL hash on the <url> tag
            url = paper.find("./url")
            if url is not None:
                url.attrib["hash"] = checksum

            if change_type == "revision" and revno == 2:
                if paper.find("./url") is not None:
                    current_version_url = infer_url(paper.find("./url").text) + ".pdf"

                # Download original file
                # There are no versioned files the first time around, so create the
                # first one (essentially backing up the original version)
                revised_file_v1_path = os.path.join(output_dir,
                                                    f"{anth_id}{change_letter}1.pdf")

                retrieve_url(current_version_url, revised_file_v1_path)
                validate_file_type(revised_file_v1_path)

                old_checksum = compute_hash_from_file(revised_file_v1_path)

                # First revision requires making the original version explicit
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{anth_id}{change_letter}1",
                        "hash": old_checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{anth_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": date,
                },
                parent=paper,
            )
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML', file=sys.stderr)
    else:
        print(f"-> FATAL: paper ID {anth_id} not found in the Anthology", file=sys.stderr)
        sys.exit(1)

    revised_file_versioned_path = os.path.join(output_dir,
                                               f"{anth_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(pdf_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if change_type == "revision":
        maybe_copy(pdf_path, canonical_path)
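# Hedged usage sketch: a dry run of add_revision() for a hypothetical corrected PDF.
# Both the Anthology ID and the local path are illustrative values only; with
# dry_run=True no XML is written and no files are copied, so this just reports the
# copies that would happen.
add_revision(
    "P19-1001",
    "/tmp/P19-1001-corrected.pdf",
    explanation="Fixed a typo in Table 2.",
    change_type="revision",
    dry_run=True,
)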
def add_attachment(anthology_id, path, attach_type, overwrite=False):
    """
    Adds a single attachment to the Anthology data files.

    Arguments:
    - The ACL ID of the paper (e.g., P17-1012)
    - The path to the attachment (can be a URL)
    - The attachment type (poster, presentation, note, software)
    - Whether to overwrite the downloaded file.
    """
    collection_id, volume_id, paper_id = deconstruct_anthology_id(anthology_id)

    paper_extension = path.replace("?dl=1", "").split(".")[-1]

    output_dir = os.path.join(args.attachment_root, collection_id[0], collection_id)
    attachment_file_name = f"{anthology_id}.{attach_type}.{paper_extension}"
    dest_path = os.path.join(output_dir, attachment_file_name)
    if os.path.exists(dest_path) and not overwrite:
        print(f"-> target file {dest_path} already in place, refusing to overwrite",
              file=sys.stderr)
        return None

    if path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        try:
            print(f"-> Downloading file from {path} to {input_file_path}", file=sys.stderr)
            with urllib.request.urlopen(path) as url, open(input_file_path,
                                                           mode="wb") as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            raise Exception(f"Could not download {path}")
    else:
        input_file_path = path

    detected = filetype.guess(input_file_path)
    if detected is None or not detected.mime.endswith(detected.extension):
        mime_type = 'UNKNOWN' if detected is None else detected.mime
        raise Exception(f"{anthology_id} file {path} has MIME type {mime_type}")

    if paper_extension not in ALLOWED_TYPES:
        raise Exception(f"-> Unknown file extension {paper_extension} for {path}")

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        # Check if attachment already exists
        for attachment in paper.findall("attachment"):
            if attachment.text == attachment_file_name:
                print(f"-> attachment {attachment_file_name} already exists in the XML",
                      file=sys.stderr)
                break
        else:
            attachment = ET.Element("attachment")
            attachment.attrib["type"] = attach_type.lower()
            attachment.text = attachment_file_name
            paper.append(attachment)
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f"-> added attachment {attachment_file_name} to the XML", file=sys.stderr)
    else:
        raise Exception(f"Paper {anthology_id} not found in the Anthology")

    # Make sure directory exists
    if not os.path.exists(output_dir):
        # print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    # Copy file
    shutil.copy(input_file_path, dest_path)
    os.chmod(dest_path, 0o644)
    print(f"-> copied {input_file_path} to {dest_path} and fixed perms", file=sys.stderr)

    # Clean up
    if path.startswith("http"):
        os.remove(input_file_path)

    return dest_path
def main(args):
    change_type = 'erratum' if args.erratum else 'revision'
    change_letter = 'e' if args.erratum else 'v'

    print(f'Processing {change_type} to {args.anthology_id}...')

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith('http'):
        _, input_file_path = tempfile.mkstemp()
        try:
            print(f'-> Downloading file from {args.path}', file=sys.stderr)
            with urllib.request.urlopen(args.path) as url, open(input_file_path,
                                                                mode='wb') as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            print('An SSL error was encountered in downloading the files.', file=sys.stderr)
            sys.exit(1)
    else:
        input_file_path = args.path

    collection_id, volume_id, paper_id = deconstruct_anthology_id(args.anthology_id)
    paper_extension = args.path.split('.')[-1]

    # The new version
    revno = None

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'xml',
                            f'{collection_id}.xml')
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if args.erratum else 2
        for revision in revisions:
            revno = int(revision.attrib['id']) + 1

        if args.do:
            revision = ET.Element(change_type)
            revision.attrib['id'] = str(revno)
            revision.attrib['href'] = f'{args.anthology_id}{change_letter}{revno}'
            revision.text = args.explanation

            # Set tails to maintain proper indentation
            paper[-1].tail += '  '
            revision.tail = '\n    '  # newline and two levels of indent

            paper.append(revision)
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML', file=sys.stderr)
    else:
        print(f'-> FATAL: paper ID {args.anthology_id} not found in the Anthology',
              file=sys.stderr)
        sys.exit(1)

    output_dir = os.path.join(args.anthology_dir, 'pdf', collection_id[0], collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f'-> Creating directory {output_dir}', file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f'{args.anthology_id}.pdf')

    if not args.erratum and revno == 2:
        # There are no versioned files the first time around, so create the first one
        # (essentially backing up the original version)
        revised_file_v1_path = os.path.join(output_dir,
                                            f'{args.anthology_id}{change_letter}1.pdf')

        current_version = ANTHOLOGY_PDF.format(args.anthology_id)
        if args.do:
            try:
                print(f'-> Downloading file from {args.path} to {revised_file_v1_path}',
                      file=sys.stderr)
                with urllib.request.urlopen(current_version) as url, open(
                        revised_file_v1_path, mode='wb') as fh:
                    fh.write(url.read())
            except ssl.SSLError:
                print(f'-> FATAL: An SSL error was encountered in downloading {args.path}.',
                      file=sys.stderr)
                sys.exit(1)
        else:
            print(f'-> DRY RUN: Downloading file from {args.path} to {revised_file_v1_path}',
                  file=sys.stderr)

    revised_file_versioned_path = os.path.join(output_dir,
                                               f'{args.anthology_id}{change_letter}{revno}.pdf')

    maybe_copy(input_file_path, revised_file_versioned_path, args.do)
    maybe_copy(input_file_path, canonical_path, args.do)

    if args.path.startswith('http'):
        os.remove(input_file_path)
from anthology.utils import deconstruct_anthology_id

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("collections", nargs="+")
    args = parser.parse_args()

    anthology = Anthology(
        importdir=os.path.join(os.path.dirname(sys.argv[0]), "..", "data"))

    # header
    print("name", "id", "title", sep="\t")

    for id_, paper in anthology.papers.items():
        collection_id, volume_name, paper_id = deconstruct_anthology_id(id_)
        if collection_id in args.collections:
            authors = paper.attrib.get("author", [])
            if len(authors) > 0:
                # "authors" is a list of ("last name || first name", name-id or None) tuples
                first_author = authors[0][0]
                authors_papers = list(
                    anthology.people.name_to_papers[first_author].values())
                authors_papers = authors_papers[0] + authors_papers[1]
                if len(authors_papers) == 1:
                    print(first_author.full, id_, paper.get_title('text'), sep="\t")
def main(args):
    def maybe_copy(file_from, file_to):
        if not args.dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to), file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print("-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)

    change_type = "erratum" if args.erratum else "revision"
    change_letter = "e" if args.erratum else "v"

    print(f"Processing {change_type} to {args.anthology_id}...")

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        download_file(args.path, input_file_path)
    else:
        input_file_path = args.path

    validate_file_type(input_file_path)

    collection_id, volume_id, paper_id = deconstruct_anthology_id(args.anthology_id)
    paper_extension = args.path.split(".")[-1]

    # The new version
    revno = None

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Files for old-style IDs are stored under anthology-files/pdf/P/P19/*
    # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/*
    if is_newstyle_id(args.anthology_id):
        venue_name = collection_id.split(".")[1]
        output_dir = os.path.join(args.anthology_dir, "pdf", venue_name)
    else:
        output_dir = os.path.join(args.anthology_dir, "pdf", collection_id[0], collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{args.anthology_id}.pdf")

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)
    if paper_id == "0":
        paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
    else:
        paper = tree.getroot().find(f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")

    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if args.erratum else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not args.dry_run:
            # Update the URL hash on the <url> tag
            url = paper.find("./url")
            if url is not None:
                url.attrib["hash"] = checksum

            if not args.erratum and revno == 2:
                if paper.find("./url") is not None:
                    current_version_url = infer_url(paper.find("./url").text) + ".pdf"

                # Download original file
                # There are no versioned files the first time around, so create the
                # first one (essentially backing up the original version)
                revised_file_v1_path = os.path.join(
                    output_dir, f"{args.anthology_id}{change_letter}1.pdf")

                download_file(current_version_url, revised_file_v1_path)
                validate_file_type(revised_file_v1_path)

                with open(revised_file_v1_path, "rb") as f:
                    old_checksum = compute_hash(f.read())

                # First revision requires making the original version explicit
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{args.anthology_id}{change_letter}1",
                        "hash": old_checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                args.explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{args.anthology_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": args.date,
                },
                parent=paper,
            )
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML', file=sys.stderr)
    else:
        print(f"-> FATAL: paper ID {args.anthology_id} not found in the Anthology",
              file=sys.stderr)
        sys.exit(1)

    revised_file_versioned_path = os.path.join(
        output_dir, f"{args.anthology_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(input_file_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if not args.erratum:
        maybe_copy(input_file_path, canonical_path)

    if args.path.startswith("http"):
        os.remove(input_file_path)
def add_attachment(anthology_id, path, attach_type, overwrite=False):
    """
    Adds a single attachment to the Anthology data files.

    Arguments:
    - The ACL ID of the paper (e.g., P17-1012)
    - The path to the attachment (can be a URL)
    - The attachment type (poster, presentation, note, software)
    - Whether to overwrite the downloaded file.
    """
    collection_id, volume_id, paper_id = deconstruct_anthology_id(anthology_id)

    if path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        try:
            print(f"-> Downloading file from {path} to {input_file_path}", file=sys.stderr)
            request = urllib.request.Request(path, headers={'User-Agent': 'Mozilla/5.0'})
            with urllib.request.urlopen(request) as url, open(input_file_path,
                                                              mode="wb") as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            raise Exception(f"Could not download {path}")
        except Exception as e:
            raise e
    else:
        input_file_path = path

    file_extension = path.replace("?dl=1", "").split(".")[-1]
    # Many links from file sharing services are not informative and don't have
    # extensions, so we could try to guess.
    if file_extension not in ALLOWED_TYPES:
        detected = filetype.guess(input_file_path)
        if detected is not None:
            file_extension = detected.mime.split("/")[-1]
        if file_extension not in ALLOWED_TYPES:
            print(f"Could not determine file extension for {anthology_id} at {path}",
                  file=sys.stderr)

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)

    attachment_file_name = f"{anthology_id}.{attach_type}.{file_extension}"

    paper = tree.getroot().find(f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        # Check if attachment already exists
        for attachment in paper.findall("attachment"):
            if attachment.text == attachment_file_name:
                print(f"-> attachment {attachment_file_name} already exists in the XML",
                      file=sys.stderr)
                break
        else:
            attachment = ET.Element("attachment")
            attachment.attrib["type"] = attach_type.lower()
            attachment.attrib["hash"] = checksum
            attachment.text = attachment_file_name
            paper.append(attachment)
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f"-> added attachment {attachment_file_name} to the XML", file=sys.stderr)
    else:
        print(f"Paper {anthology_id} not found in the Anthology", file=sys.stderr)

    # Make sure directory exists
    output_dir = os.path.join(args.attachment_root, collection_id[0], collection_id)
    if not os.path.exists(output_dir):
        # print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    # Copy file
    dest_path = os.path.join(output_dir, attachment_file_name)
    if os.path.exists(dest_path) and not overwrite:
        print(f"-> target file {dest_path} already in place, refusing to overwrite",
              file=sys.stderr)
        return None

    shutil.copy(input_file_path, dest_path)
    os.chmod(dest_path, 0o644)
    print(f"-> copied {input_file_path} to {dest_path} and fixed perms", file=sys.stderr)

    # Clean up
    if path.startswith("http"):
        os.remove(input_file_path)

    return dest_path
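# Hedged usage sketch: attach a (hypothetical) poster PDF to P17-1012. The URL is
# illustrative only; add_attachment() downloads it, checks the extension, records an
# <attachment> element in the collection XML, and copies the file into the
# attachments tree.
add_attachment(
    "P17-1012",
    "https://example.org/posters/P17-1012.Poster.pdf",
    "Poster",
    overwrite=False,
)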