"--ingest-date", "-d", type=str, default=today, help="Ingestion date as YYYY-MM-DD. Default: %(default)s.", ) parser.add_argument( "--append", "-a", action="store_true", help="Append to existing volume instead of quitting.", ) args = parser.parse_args() people = AnthologyIndex(None, srcdir=os.path.join(os.path.dirname(sys.argv[0]), "..", "data")) pdf_directory = os.path.dirname(args.infile) tree_being_added = etree.parse(args.infile) # Ensure nested format root_being_added = make_nested(tree_being_added.getroot(), pdf_path=os.path.dirname(args.infile)) collection_id = root_being_added.attrib["id"] # Ensure names are properly identified ambiguous = {} for paper in root_being_added.findall(".//paper"): anth_id = build_anthology_id(collection_id, paper.getparent().attrib["id"],
def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
    venue_keys = [
        venue["slug"].lower()
        for _, venue in VenueIndex(srcdir=anthology_datadir).items()
    ]

    # Build list of volumes, confirm uniqueness
    unseen_venues = []
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))

        venue_name = meta["abbrev"].lower()
        if venue_name not in venue_keys:
            unseen_venues.append(meta["abbrev"])

        meta["path"] = proceedings
        meta["collection_id"] = collection_id = meta["year"] + "." + meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print(f"Error: duplicate volume ID {volume_full_id}")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

    # Make sure all venues exist
    if len(unseen_venues) > 0:
        print("FATAL: The following venue(s) don't exist in venues.yaml:")
        for venue in unseen_venues:
            print(f"- {venue}")
        print("Please create entries for them and re-ingest.")
        sys.exit(1)

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        year = meta["year"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        # copy the book
        book_src_filename = meta["abbrev"] + "-" + year
        book_src_path = os.path.join(root_path, book_src_filename) + ".pdf"
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf"
            )
            if not args.dry_run and not os.path.exists(book_dest_path):
                log(f"Copying {book_src_path} -> {book_dest_path}", args.dry_run)
                shutil.copyfile(book_src_path, book_dest_path)

        # copy the paper PDFs (file names end in .{number}.pdf)
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            match = re.match(rf".*\.(\d+)\.pdf", pdf_file)
            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf"
                )
                if not args.dry_run and not os.path.exists(pdf_dest_path):
                    log(f"Copying {pdf_src_path} -> {pdf_dest_path}", args.dry_run)
                    shutil.copyfile(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir, venue_name)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(os.path.join(root_path, "additional")):
                attachment_file_path = os.path.join(
                    root_path, "additional", attachment_file
                )

                match = re.match(
                    rf"{year}\.{venue_name}-\w+\.(\d+)_?(\w+)\.(\w+)$", attachment_file
                )
                if match is None:
                    print(
                        f"* Warning: no attachment match for {attachment_file}",
                        file=sys.stderr,
                    )
                    sys.exit(2)

                paper_num, type_, ext = match.groups()
                paper_num = int(paper_num)

                file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                dest_path = os.path.join(attachments_dest_dir, file_name)
                if not args.dry_run and not os.path.exists(dest_path):
                    log(f"Copying {attachment_file} -> {dest_path}", args.dry_run)
                    shutil.copyfile(attachment_file_path, dest_path)

                collections[collection_id][volume_name][paper_num]["attachments"].append(
                    (dest_path, type_)
                )

    people = AnthologyIndex(None, srcdir=anthology_datadir)

    def disambiguate_name(node, anth_id):
        name = PersonName.from_element(node)
        ids = people.get_ids(name)
        if len(ids) > 1:
            choice = -1
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                choice = int(input("--> "))
            node.attrib["id"] = ids[choice]

    for collection_id, collection in collections.items():
        # Newly added volumes, so we can normalize and name-disambig later
        newly_added_volumes = []

        collection_file = os.path.join(
            args.anthology_dir, "data", "xml", f"{collection_id}.xml"
        )
        if os.path.exists(collection_file):
            root_node = etree.parse(collection_file).getroot()
        else:
            root_node = make_simple_element("collection", attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume",
                attrib={"id": volume_id, "ingest-date": args.ingest_date},
            )

            # Replace the existing one if present
            existing_volume_node = root_node.find(f"./volume[@id='{volume_id}']")
            for i, child in enumerate(root_node):
                if child.attrib["id"] == volume_id:
                    root_node[i] = volume_node
                    break
            else:
                root_node.append(volume_node)

            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)
                # print(etree.tostring(paper_node, pretty_print=True))

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    for author_or_editor in chain(
                        paper_node.findall("./author"), paper_node.findall("./editor")
                    ):
                        meta_node.append(author_or_editor)
                        author_or_editor.tag = "editor"
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            attrib={"hash": compute_hash_from_file(book_dest_path)},
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields (iterate over a copy, since we mutate)
                    for child in list(paper_node):
                        if child.tag in [
                            "editor",
                            "address",
                            "booktitle",
                            "publisher",
                            "year",
                            "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                            "hash": compute_hash_from_file(path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

                # Normalize
                for oldnode in paper_node:
                    normalize(oldnode, informat="latex")

                for name_node in chain(
                    paper_node.findall("./author"), paper_node.findall("./editor")
                ):
                    disambiguate_name(name_node, paper_id_full)

            # Other data from the meta file
            if "isbn" in meta:
                make_simple_element("isbn", meta["isbn"], parent=meta_node)

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(
            collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True
        )
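# `log` is defined elsewhere in this script. A minimal sketch of the behavior
# the calls above assume: print the message, marking it when --dry-run is set.
# Hypothetical reconstruction, not necessarily the actual definition.
def log_sketch(text, fake=False):
    message = "[DRY RUN] " if fake else ""
    print(f"{message}{text}", file=sys.stderr)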
def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    # Build list of volumes, confirm uniqueness
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))

        meta["path"] = proceedings
        meta["collection_id"] = collection_id = meta["year"] + "." + meta["abbrev"].lower()
        volume_name = meta["volume_name"]
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print(f"Error: duplicate volume ID {volume_full_id}")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume_name"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        print(f"VOLUME: {volume}")

        # copy the book
        book_src_filename = meta["abbrev"] + "-" + meta["year"]
        book_src_path = os.path.join(root_path, book_src_filename) + ".pdf"
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf"
            )
            log(f"Copying {book_src_path} -> {book_dest_path}", args.dry_run)
            if not args.dry_run:
                shutil.copyfile(book_src_path, book_dest_path)

        # copy the paper PDFs (names are {abbrev}{number}.pdf)
        abbrev = meta["abbrev"]
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            match = re.match(rf"{abbrev}(\d+)\.pdf", pdf_file)
            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf"
                )
                log(
                    f"Copying [{paper_id_full}] {pdf_src_path} -> {pdf_dest_path}",
                    args.dry_run,
                )
                if not args.dry_run:
                    shutil.copyfile(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir, collection_id)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(os.path.join(root_path, "additional")):
                match = re.match(rf"{abbrev}(\d+)_(\w+)\.(\w+)", attachment_file)
                if match is not None:
                    paper_num, type_, ext = match.groups()
                    paper_num = int(paper_num)

                    file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                    dest_path = os.path.join(attachments_dest_dir, file_name)
                    attachment_file_path = os.path.join(
                        root_path, "additional", attachment_file
                    )
                    log(f"Copying {attachment_file} -> {dest_path}", args.dry_run)
                    if not args.dry_run:
                        shutil.copyfile(attachment_file_path, dest_path)

                    collections[collection_id][volume_name][paper_num][
                        "attachments"
                    ].append((dest_path, type_))

    people = AnthologyIndex(
        None, srcdir=os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
    )

    for collection_id, collection in collections.items():
        collection_file = os.path.join(
            args.anthology_dir, "data", "xml", f"{collection_id}.xml"
        )
        root_node = make_simple_element("collection", attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume", attrib={"id": volume_id}, parent=root_node
            )
            meta = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)
                # print(etree.tostring(paper_node, pretty_print=True))

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta.append(title_node)
                    for editor in paper_node.findall("editor"):
                        meta.append(editor)
                    meta.append(paper_node.find("publisher"))
                    meta.append(paper_node.find("address"))
                    meta.append(paper_node.find("month"))
                    meta.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url", text=f"{collection_id}-{volume_name}", parent=meta
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields (iterate over a copy, since we mutate)
                    for child in list(paper_node):
                        if child.tag in [
                            "editor",
                            "address",
                            "booktitle",
                            "publisher",
                            "year",
                            "month",
                        ]:
                            paper_node.remove(child)

                # attachments were stored as (path, type) tuples above
                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=path,
                        attrib={"type": type_},
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

        # Normalize
        for paper in root_node.findall(".//paper"):
            for oldnode in paper:
                normalize(oldnode, informat="latex")

        # Ensure names are properly identified
        ambiguous = {}
        for paper in root_node.findall(".//paper"):
            anth_id = build_anthology_id(
                collection_id, paper.getparent().attrib["id"], paper.attrib["id"]
            )

            for node in chain(paper.findall("author"), paper.findall("editor")):
                name = PersonName.from_element(node)
                ids = people.get_ids(name)
                if len(ids) > 1:
                    print(
                        f"WARNING ({anth_id}): ambiguous author {name}, defaulting to first of {ids}"
                    )
                    ambiguous[anth_id] = (name, ids)
                    node.attrib["id"] = ids[0]

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(
            collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True
        )
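# `read_meta` parses a proceedings' "meta" file into a dict with keys such as
# "abbrev", "year", "volume", and "title" (as used above). A minimal sketch
# assuming one "key value" pair per line; the real file format may differ.
def read_meta_sketch(path):
    meta = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            key, _, value = line.partition(" ")
            meta[key] = value.strip()
    return meta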
def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")

    venue_index = VenueIndex(srcdir=anthology_datadir)
    venue_keys = [venue["slug"].lower() for _, venue in venue_index.items()]

    sig_index = SIGIndex(srcdir=anthology_datadir)

    # Build list of volumes, confirm uniqueness
    unseen_venues = []
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))

        venue_abbrev = meta["abbrev"]
        venue_slug = venue_index.get_slug(venue_abbrev)

        if str(datetime.now().year) in venue_abbrev:
            print(f"Fatal: Venue assembler put year in acronym: '{venue_abbrev}'")
            sys.exit(1)

        if venue_slug not in venue_keys:
            unseen_venues.append((venue_slug, venue_abbrev, meta["title"]))

        meta["path"] = proceedings
        meta["collection_id"] = collection_id = meta["year"] + "." + venue_slug
        volume_name = meta["volume"].lower()
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print(f"Error: duplicate volume ID {volume_full_id}")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

        if "sig" in meta:
            print(
                f"Add this line to {anthology_datadir}/sigs/{meta['sig'].lower()}.yaml:"
            )
            print(f"  - {meta['year']}:")
            print(f"    - {volume_full_id} # {meta['booktitle']}")

    # Make sure all venues exist
    if len(unseen_venues) > 0:
        for venue in unseen_venues:
            slug, abbrev, title = venue
            print(f"Creating venue '{abbrev}' ({title})")
            venue_index.add_venue(abbrev, title)
        venue_index.dump(directory=anthology_datadir)

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        year = meta["year"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        # copy the book
        book_src_filename = f'{year}.{meta["abbrev"]}-{volume_name}.pdf'
        book_src_path = os.path.join(root_path, book_src_filename)
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf"
            )
            if not args.dry_run:
                maybe_copy(book_src_path, book_dest_path)

        # copy the paper PDFs (file names end in .{number}.pdf)
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # Skip . (hidden) files
            if os.path.basename(pdf_file).startswith("."):
                continue

            match = re.match(rf".*\.(\d+)\.pdf", pdf_file)
            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf"
                )
                if not args.dry_run:
                    maybe_copy(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir, venue_name)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(os.path.join(root_path, "additional")):
                if os.path.basename(attachment_file).startswith("."):
                    continue

                attachment_file_path = os.path.join(
                    root_path, "additional", attachment_file
                )

                match = re.match(
                    rf"{year}\.{venue_name}-\w+\.(\d+)_?(\w+)\.(\w+)$", attachment_file
                )
                if match is None:
                    print(
                        f"* Warning: no attachment match for {attachment_file}",
                        file=sys.stderr,
                    )
                    sys.exit(2)

                paper_num, type_, ext = match.groups()
                paper_num = int(paper_num)

                file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                dest_path = os.path.join(attachments_dest_dir, file_name)
                if not args.dry_run and not os.path.exists(dest_path):
                    log(f"Copying {attachment_file} -> {dest_path}", args.dry_run)
                    shutil.copyfile(attachment_file_path, dest_path)

                collections[collection_id][volume_name][paper_num]["attachments"].append(
                    (dest_path, type_)
                )

    people = AnthologyIndex(None, srcdir=anthology_datadir)

    def correct_caps(person, name_node, anth_id):
        """
        Many people submit their names in "ALL CAPS" or "all lowercase".
        Correct this with heuristics.
        """
        name = name_node.text
        if name.islower() or name.isupper():
            # capitalize all parts
            corrected = " ".join(part.capitalize() for part in name.split())
            print(
                f"-> Correcting capitalization of '{name}' to '{corrected}'",
                file=sys.stderr,
            )
            name_node.text = corrected

    def disambiguate_name(node, anth_id):
        name = PersonName.from_element(node)
        ids = people.get_ids(name)
        if len(ids) > 1:
            choice = -1
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                choice = int(input("--> "))
            node.attrib["id"] = ids[choice]

    for collection_id, collection in collections.items():
        # Newly added volumes, so we can normalize and name-disambig later
        newly_added_volumes = []

        collection_file = os.path.join(
            args.anthology_dir, "data", "xml", f"{collection_id}.xml"
        )
        if os.path.exists(collection_file):
            root_node = etree.parse(collection_file).getroot()
        else:
            root_node = make_simple_element("collection", attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume",
                attrib={"id": volume_id, "ingest-date": args.ingest_date},
            )

            # Replace the existing one if present
            existing_volume_node = root_node.find(f"./volume[@id='{volume_id}']")
            for i, child in enumerate(root_node):
                if child.attrib["id"] == volume_id:
                    root_node[i] = volume_node
                    break
            else:
                root_node.append(volume_node)

            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    for author_or_editor in chain(
                        paper_node.findall("./author"), paper_node.findall("./editor")
                    ):
                        meta_node.append(author_or_editor)
                        author_or_editor.tag = "editor"
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            attrib={"hash": compute_hash_from_file(book_dest_path)},
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields (iterate over a copy, since we mutate)
                    for child in list(paper_node):
                        if child.tag in [
                            "editor",
                            "address",
                            "booktitle",
                            "publisher",
                            "year",
                            "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                            "hash": compute_hash_from_file(path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

                # Normalize
                for oldnode in paper_node:
                    normalize(oldnode, informat="latex")

                # Adjust the language tag
                language_node = paper_node.find("./language")
                if language_node is not None:
                    try:
                        lang = iso639.languages.get(name=language_node.text)
                    except KeyError:
                        raise Exception(f"Can't find language '{language_node.text}'")
                    language_node.text = lang.part3
                    print(language_node.text)

                # Fix author names
                for name_node in chain(
                    paper_node.findall("./author"), paper_node.findall("./editor")
                ):
                    disambiguate_name(name_node, paper_id_full)
                    person = PersonName.from_element(name_node)
                    for name_part in name_node:
                        correct_caps(person, name_part, paper_id_full)

            # Other data from the meta file
            if "isbn" in meta:
                make_simple_element("isbn", meta["isbn"], parent=meta_node)

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(
            collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True
        )
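# `maybe_copy`, used in the copy loops above, is assumed to behave like the
# older log-then-copy pattern: copy only when the destination is not already
# in place, and say so. Illustrative sketch under that assumption, not the
# actual helper from the Anthology's utilities.
def maybe_copy_sketch(src, dest):
    if not os.path.exists(dest):
        print(f"Copying {src} -> {dest}", file=sys.stderr)
        shutil.copyfile(src, dest)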
parser = argparse.ArgumentParser()
parser.add_argument('infile')
parser.add_argument(
    '--ingest-date',
    '-d',
    type=str,
    default=today,
    help='Ingestion date as YYYY-MM-DD. Default: %(default)s.')
parser.add_argument('--append',
                    '-a',
                    action='store_true',
                    help='Append to existing volume instead of quitting.')
args = parser.parse_args()

people = AnthologyIndex(
    None, srcdir=os.path.join(os.path.dirname(sys.argv[0]), '..', 'data'))

tree_being_added = etree.parse(args.infile)

# Ensure nested format
root_being_added = make_nested(tree_being_added.getroot())
collection_id = root_being_added.attrib['id']

# Ensure names are properly identified
ambiguous = {}
for paper in root_being_added.findall('.//paper'):
    anth_id = build_anthology_id(collection_id,
                                 paper.getparent().attrib['id'],
                                 paper.attrib['id'])