def create_bibtex(anthology, trgdir, clean=False):
    """Write BibTeX files for every paper in the anthology.

    Three kinds of output are produced below ``trgdir``:

    * ``papers/<X>/<XYZ>/<paper.full_id>.bib`` -- one file per paper, where
      ``<X>`` and ``<XYZ>`` are the first one and the first three characters
      of the volume ID;
    * ``volumes/<volume_id>.bib`` -- the entries of one volume, each followed
      by a newline;
    * ``anthology.bib.gz`` -- the entries of all volumes, gzip-compressed.

    Nothing is written if ``check_directory`` returns a false value for
    either the ``papers`` or the ``volumes`` directory.

    Args:
        anthology: Object whose ``volumes`` mapping yields
            ``(volume_id, volume)`` pairs; iterating a volume yields papers
            that provide ``full_id`` and ``as_bibtex()``.
        trgdir: Directory below which all output is written.
        clean: Passed through to ``check_directory``.
    """
    if not check_directory("{}/papers".format(trgdir), clean=clean):
        return
    if not check_directory("{}/volumes".format(trgdir), clean=clean):
        return

    log.info("Creating BibTeX files for all papers...")
    with gzip.open(
        "{}/anthology.bib.gz".format(trgdir), "wt", encoding="utf-8"
    ) as file_full:
        for volume_id, volume in tqdm(anthology.volumes.items()):
            volume_dir = "{}/papers/{}/{}".format(trgdir, volume_id[0], volume_id[:3])
            # exist_ok avoids the check-then-create race of a separate
            # os.path.exists() test.
            os.makedirs(volume_dir, exist_ok=True)
            # All plain-text outputs are written as UTF-8 so that they agree
            # with the gzip bundle regardless of the platform's locale.
            with open(
                "{}/volumes/{}.bib".format(trgdir, volume_id), "w", encoding="utf-8"
            ) as file_volume:
                for paper in volume:
                    # Render first, so a failing paper does not leave an
                    # empty, truncated .bib file behind.
                    contents = paper.as_bibtex()
                    with open(
                        "{}/{}.bib".format(volume_dir, paper.full_id),
                        "w",
                        encoding="utf-8",
                    ) as file_paper:
                        file_paper.write(contents)
                    file_volume.write(contents)
                    file_volume.write("\n")
                    file_full.write(contents)
                    file_full.write("\n")
示例#2
0
def create_bibtex(anthology, trgdir, clean=False):
    """Write BibTeX files for every paper in the anthology.

    For each paper two renderings are produced: the full entry from
    ``paper.as_bibtex()`` and a concise one from
    ``paper.as_bibtex(concise=True)``. They are distributed as follows:

    * ``<trgdir>/<paper.full_id>.bib`` -- full entry of one paper;
    * ``<trgdir>/volumes/<volume_id>.bib`` -- concise entries of one volume;
    * ``<trgdir>/anthology.bib.gz`` -- concise entries of all volumes;
    * ``<trgdir>/anthology+abstracts.bib.gz`` -- full entries of all volumes.

    Volumes are processed in descending order of ``volume_sorter``. Nothing
    is written if ``check_directory`` returns a false value for either the
    ``papers`` or the ``volumes`` directory.

    Args:
        anthology: Object whose ``volumes`` mapping yields
            ``(volume_id, volume)`` pairs; iterating a volume yields papers
            that provide ``full_id`` and ``as_bibtex()``.
        trgdir: Directory below which all output is written.
        clean: Passed through to ``check_directory``.
    """
    if not check_directory("{}/papers".format(trgdir), clean=clean):
        return
    if not check_directory("{}/volumes".format(trgdir), clean=clean):
        return

    log.info("Creating BibTeX files for all papers...")
    with gzip.open(
        "{}/anthology.bib.gz".format(trgdir), "wt", encoding="utf-8"
    ) as file_anthology, gzip.open(
        "{}/anthology+abstracts.bib.gz".format(trgdir), "wt", encoding="utf-8"
    ) as file_anthology_with_abstracts:
        # NOTE(review): per-paper files go directly into trgdir although the
        # "papers" subdirectory is what gets validated above -- confirm that
        # this flat layout is intended.
        volume_dir = trgdir
        # The directory is the same for every volume, so it is ensured once
        # instead of on every loop iteration.
        os.makedirs(volume_dir, exist_ok=True)

        for volume_id, volume in tqdm(
            sorted(anthology.volumes.items(), key=volume_sorter, reverse=True)
        ):
            # Plain-text outputs are written as UTF-8 so that they agree with
            # the gzip bundles regardless of the platform's locale.
            with open(
                "{}/volumes/{}.bib".format(trgdir, volume_id), "w", encoding="utf-8"
            ) as file_volume:
                for paper in volume:
                    # Render first, so a failing paper does not leave an
                    # empty, truncated .bib file behind.
                    contents = paper.as_bibtex()
                    with open(
                        "{}/{}.bib".format(volume_dir, paper.full_id),
                        "w",
                        encoding="utf-8",
                    ) as file_paper:
                        print(contents, file=file_paper)
                    print(contents, file=file_anthology_with_abstracts)

                    concise_contents = paper.as_bibtex(concise=True)
                    print(concise_contents, file=file_volume)
                    print(concise_contents, file=file_anthology)
示例#3
0
def create_bibtex(anthology, trgdir, clean=False):
    """Write BibTeX files for every paper in the anthology.

    Output below ``trgdir``:

    * ``papers/<X>/<XYZ>/<paper.full_id>.bib`` -- one file per paper, where
      ``<X>`` and ``<XYZ>`` are the first one and the first three characters
      of the volume ID;
    * ``volumes/<volume_id>.bib`` -- the entries of one volume, each followed
      by a newline;
    * ``anthology.bib.gz`` -- the entries of all volumes, gzip-compressed.

    Nothing is written if ``check_directory`` returns a false value for
    either the ``papers`` or the ``volumes`` directory.

    Args:
        anthology: Object whose ``volumes`` mapping yields
            ``(volume_id, volume)`` pairs; iterating a volume yields papers
            that provide ``full_id`` and ``as_bibtex()``.
        trgdir: Directory below which all output is written.
        clean: Passed through to ``check_directory``.
    """
    for subdir in ("papers", "volumes"):
        if not check_directory("{}/{}".format(trgdir, subdir), clean=clean):
            return

    log.info("Creating BibTeX files for all papers...")
    with gzip.open(
        "{}/anthology.bib.gz".format(trgdir), "wt", encoding="utf-8"
    ) as file_full:
        for volume_id, volume in tqdm(anthology.volumes.items()):
            volume_dir = "{}/papers/{}/{}".format(trgdir, volume_id[0], volume_id[:3])
            # exist_ok avoids the check-then-create race of a separate
            # os.path.exists() test.
            os.makedirs(volume_dir, exist_ok=True)
            # Plain-text outputs are written as UTF-8 so that they agree with
            # the gzip bundle regardless of the platform's locale.
            with open(
                "{}/volumes/{}.bib".format(trgdir, volume_id), "w", encoding="utf-8"
            ) as file_volume:
                for paper in volume:
                    # Render first, so a failing paper does not leave an
                    # empty, truncated .bib file behind.
                    contents = paper.as_bibtex()
                    with open(
                        "{}/{}.bib".format(volume_dir, paper.full_id),
                        "w",
                        encoding="utf-8",
                    ) as file_paper:
                        file_paper.write(contents)
                    file_volume.write(contents)
                    file_volume.write("\n")
                    file_full.write(contents)
                    file_full.write("\n")
示例#4
0
def export_anthology(anthology, outdir, clean=False, dryrun=False):
    """Build the paper, people, volume, venue and SIG indices and dump them as YAML.

    The indices are always computed; files are only written when ``dryrun``
    is false. Output layout below ``outdir``:

    * ``papers/<top_level_id>.yaml`` -- papers grouped by ``top_level_id``;
    * ``people/<c>.yaml`` -- people grouped by the first character of their ID;
    * ``volumes.yaml``, ``venues.yaml``, ``sigs.yaml``.

    Writing is abandoned silently if ``check_directory`` returns a false
    value for ``outdir`` or one of its ``papers``/``people`` subdirectories.

    Args:
        anthology: Object providing the ``papers``, ``people``, ``volumes``,
            ``venues`` and ``sigs`` collections read below.
        outdir: Directory that receives the YAML files.
        clean: Passed through to ``check_directory``.
        dryrun: If true, build all indices but do not touch the file system.
    """
    # Prepare paper index
    papers = defaultdict(dict)
    for id_, paper in anthology.papers.items():
        log.debug("export_anthology: processing paper '{}'".format(id_))
        data = paper.as_dict()
        data["title_html"] = paper.get_title("html")
        del data["xml_title"]
        if "xml_booktitle" in data:
            data["booktitle_html"] = paper.get_booktitle("html")
            del data["xml_booktitle"]
        if "xml_abstract" in data:
            data["abstract_html"] = paper.get_abstract("html")
            del data["xml_abstract"]
        # The comprehension variables are named distinctly from the loop
        # variable ``id_`` to keep the two from being confused.
        if "author" in data:
            data["author"] = [
                anthology.people.resolve_name(person, person_id)
                for person, person_id in data["author"]
            ]
        if "editor" in data:
            data["editor"] = [
                anthology.people.resolve_name(person, person_id)
                for person, person_id in data["editor"]
            ]
        papers[paper.top_level_id][paper.full_id] = data

    # Prepare people index
    people = defaultdict(dict)
    for id_ in anthology.people.personids():
        name = anthology.people.get_canonical_name(id_)
        log.debug("export_anthology: processing person '{}'".format(
            repr(name)))
        data = name.as_dict()
        data["slug"] = id_
        if id_ in anthology.people.comments:
            data["comment"] = anthology.people.comments[id_]
        if id_ in anthology.people.similar:
            data["similar"] = sorted(anthology.people.similar[id_])
        # Papers by year, most recent first.
        data["papers"] = sorted(
            anthology.people.get_papers(id_),
            key=lambda p: anthology.papers.get(p).get("year"),
            reverse=True,
        )
        # Co-authors and venues by count, highest first.
        data["coauthors"] = sorted(
            [[co_id, count]
             for (co_id, count) in anthology.people.get_coauthors(id_)],
            key=lambda p: p[1],
            reverse=True,
        )
        data["venues"] = sorted(
            [[venue, count] for (venue, count) in anthology.people.get_venues(
                anthology.venues, id_).items()],
            key=lambda p: p[1],
            reverse=True,
        )
        variants = [
            n for n in anthology.people.get_used_names(id_) if n != name
        ]
        if variants:
            data["variant_entries"] = [
                variant.as_dict() for variant in variants
            ]
        people[id_[0]][id_] = data

    # Prepare volume index
    volumes = {}
    for id_, volume in anthology.volumes.items():
        log.debug("export_anthology: processing volume '{}'".format(id_))
        # Work on a copy: editing ``volume.attrib`` itself would strip
        # "xml_title" from the volume object and make a second export fail.
        data = dict(volume.attrib)
        data["title_html"] = volume.get_title("html")
        del data["xml_title"]
        if "xml_abstract" in data:
            del data["xml_abstract"]
        data["papers"] = volume.paper_ids
        if "author" in data:
            data["author"] = [
                anthology.people.resolve_name(person, person_id)
                for person, person_id in data["author"]
            ]
        if "editor" in data:
            data["editor"] = [
                anthology.people.resolve_name(person, person_id)
                for person, person_id in data["editor"]
            ]
        volumes[volume.full_id] = data

    # Prepare venue index
    venues = {}
    for acronym, data in anthology.venues.items():
        # Shallow copy so that the anthology's own venue record keeps its
        # "volumes" entry and its original "years" value.
        data = data.copy()
        data["volumes_by_year"] = {
            year: sorted(
                filter(lambda k: volumes[k]["year"] == year, data["volumes"]))
            for year in sorted(data["years"])
        }
        data["years"] = sorted(data["years"])
        del data["volumes"]
        venues[acronym] = data

    # Prepare SIG index
    sigs = {}
    for acronym, sig in anthology.sigs.items():
        sigs[acronym] = {
            "name": sig.name,
            "slug": sig.slug,
            "url": sig.url,
            "volumes_by_year": sig.volumes_by_year,
            "years": sorted([str(year) for year in sig.years]),
        }

    # Dump all
    if dryrun:
        return

    # Create directories
    for subdir in ("", "papers", "people"):
        target_dir = "{}/{}".format(outdir, subdir)
        if not check_directory(target_dir, clean=clean):
            return

    # The 7 extra steps are volumes.yaml (counted as 5), venues.yaml and
    # sigs.yaml.
    progress = tqdm(total=len(papers) + len(people) + 7)
    try:
        for top_level_id, paper_list in papers.items():
            with open("{}/papers/{}.yaml".format(outdir, top_level_id),
                      "w", encoding="utf-8") as f:
                yaml.dump(paper_list, Dumper=Dumper, stream=f)
            progress.update()

        with open("{}/volumes.yaml".format(outdir), "w",
                  encoding="utf-8") as f:
            yaml.dump(volumes, Dumper=Dumper, stream=f)
        progress.update(5)

        with open("{}/venues.yaml".format(outdir), "w",
                  encoding="utf-8") as f:
            yaml.dump(venues, Dumper=Dumper, stream=f)
        progress.update()

        with open("{}/sigs.yaml".format(outdir), "w", encoding="utf-8") as f:
            yaml.dump(sigs, Dumper=Dumper, stream=f)
        progress.update()

        for first_letter, people_list in people.items():
            with open("{}/people/{}.yaml".format(outdir, first_letter),
                      "w", encoding="utf-8") as f:
                yaml.dump(people_list, Dumper=Dumper, stream=f)
            progress.update()
    finally:
        # Close the progress bar even if one of the dumps raises.
        progress.close()
def export_anthology(anthology, outdir, clean=False, dryrun=False):
    """Build the paper, people, volume, venue and SIG indices and dump them as YAML.

    The indices are always computed; files are only written when ``dryrun``
    is false. Output layout below ``outdir``:

    * ``papers/<collection_id>.yaml`` -- papers grouped by ``collection_id``;
    * ``people/<c>.yaml`` -- people grouped by the first character of their ID;
    * ``volumes.yaml``, ``venues.yaml``, ``sigs.yaml``.

    Writing is abandoned silently if ``check_directory`` returns a false
    value for ``outdir`` or one of its ``papers``/``people`` subdirectories.

    Args:
        anthology: Object providing the ``papers``, ``people``, ``volumes``,
            ``venues`` and ``sigs`` collections read below.
        outdir: Directory that receives the YAML files.
        clean: Passed through to ``check_directory``.
        dryrun: If true, build all indices but do not touch the file system.
    """
    # Prepare paper index
    papers = defaultdict(dict)
    for id_, paper in anthology.papers.items():
        log.debug("export_anthology: processing paper '{}'".format(id_))
        data = paper.as_dict()
        data["title_html"] = paper.get_title("html")
        if "xml_title" in data:
            del data["xml_title"]
        if "xml_booktitle" in data:
            data["booktitle_html"] = paper.get_booktitle("html")
            del data["xml_booktitle"]
        if "xml_abstract" in data:
            data["abstract_html"] = paper.get_abstract("html")
            del data["xml_abstract"]
        if "xml_url" in data:
            del data["xml_url"]
        # The comprehension variables are named distinctly from the loop
        # variable ``id_`` to keep the two from being confused.
        if "author" in data:
            data["author"] = [
                anthology.people.resolve_name(person, person_id)
                for person, person_id in data["author"]
            ]
        if "editor" in data:
            data["editor"] = [
                anthology.people.resolve_name(person, person_id)
                for person, person_id in data["editor"]
            ]
        papers[paper.collection_id][paper.full_id] = data

    # Prepare people index
    people = defaultdict(dict)
    for id_ in anthology.people.personids():
        name = anthology.people.get_canonical_name(id_)
        log.debug("export_anthology: processing person '{}'".format(
            repr(name)))
        data = name.as_dict()
        data["slug"] = id_
        if id_ in anthology.people.comments:
            data["comment"] = anthology.people.comments[id_]
        if id_ in anthology.people.similar:
            data["similar"] = sorted(anthology.people.similar[id_])
        # Papers by year, most recent first.
        data["papers"] = sorted(
            anthology.people.get_papers(id_),
            key=lambda p: anthology.papers.get(p).get("year"),
            reverse=True,
        )
        # Co-authors and venues by count, highest first.
        data["coauthors"] = sorted(
            [[co_id, count]
             for (co_id, count) in anthology.people.get_coauthors(id_)],
            key=lambda p: p[1],
            reverse=True,
        )
        data["venues"] = sorted(
            [[venue, count] for (venue, count) in anthology.people.get_venues(
                anthology.venues, id_).items()],
            key=lambda p: p[1],
            reverse=True,
        )
        # A used name counts as a variant when its first or last name
        # differs from the canonical name.
        variants = [
            n for n in anthology.people.get_used_names(id_)
            if n.first != name.first or n.last != name.last
        ]
        if variants:
            data["variant_entries"] = [
                variant.as_dict() for variant in sorted(variants)
            ]
        people[id_[0]][id_] = data

    # Prepare volume index
    volumes = {}
    for id_, volume in anthology.volumes.items():
        log.debug("export_anthology: processing volume '{}'".format(id_))
        data = volume.as_dict()
        data["title_html"] = volume.get_title("html")
        # Removed tolerantly, like the other xml_* keys here, so that a
        # volume without a book title does not abort the whole export.
        data.pop("xml_booktitle", None)
        if "xml_abstract" in data:
            del data["xml_abstract"]
        if "xml_url" in data:
            del data["xml_url"]
        data["has_abstracts"] = volume.has_abstracts
        data["papers"] = volume.paper_ids
        if "author" in data:
            data["author"] = [
                anthology.people.resolve_name(person, person_id)
                for person, person_id in data["author"]
            ]
        if "editor" in data:
            data["editor"] = [
                anthology.people.resolve_name(person, person_id)
                for person, person_id in data["editor"]
            ]
        volumes[volume.full_id] = data

    class SortedVolume:
        """Keys for sorting volumes so they appear in a more reasonable order.
        Takes the parent venue being sorted under, along with its letter,
        and the Anthology ID of the current volume. For example, LREC 2020
        has the following joint events, which get sorted in the following manner:

        ['2020.lrec-1', '2020.aespen-1', '2020.ai4hi-1',
        '2020.bucc-1', '2020.calcs-1', '2020.cllrd-1', '2020.clssts-1',
        '2020.cmlc-1', '2020.computerm-1', '2020.framenet-1', '2020.gamnlp-1',
        '2020.globalex-1', '2020.isa-1', '2020.iwltp-1',
        '2020.ldl-1', '2020.lincr-1', '2020.lr4sshoc-1', '2020.lt4gov-1',
        '2020.lt4hala-1', '2020.multilingualbio-1', '2020.onion-1',
        '2020.osact-1', '2020.parlaclarin-1', '2020.rail-1', '2020.readi-1',
        '2020.restup-1', '2020.sltu-1', '2020.stoc-1', '2020.trac-1',
        '2020.wac-1', '2020.wildre-1']
        """
        def __init__(self, acronym, letter, anth_id):
            self.parent_venue = acronym.lower()
            self.anth_id = anth_id

            collection_id, self.volume_id, _ = deconstruct_anthology_id(
                anth_id)
            if is_newstyle_id(collection_id):
                # New-style collection IDs: the venue is the part after the
                # first dot, compared against the lower-cased acronym.
                self.venue = collection_id.split(".")[1]
                self.is_parent_venue = self.venue == self.parent_venue
            else:
                # Old-style IDs: the venue is the leading character,
                # compared against the venue's old-style letter.
                self.venue = collection_id[0]
                self.is_parent_venue = self.venue == letter

        def __str__(self):
            return self.anth_id

        def __eq__(self, other):
            """We define equivalence at the venue (not volume) level in order
            to preserve the sort order found in the XML"""
            if not isinstance(other, SortedVolume):
                return NotImplemented
            return self.venue == other.venue

        def __hash__(self):
            """Hash consistently with ``__eq__``, i.e. on the venue only"""
            return hash(self.venue)

        def __lt__(self, other):
            """First parent volumes, then sort by venue name"""
            if not isinstance(other, SortedVolume):
                return NotImplemented
            if self.is_parent_venue == other.is_parent_venue:
                return self.venue < other.venue
            return self.is_parent_venue and not other.is_parent_venue

    # Prepare venue index
    venues = {}
    for acronym, data in anthology.venues.items():
        letter = data.get("oldstyle_letter", "W")
        # Shallow copy so that the anthology's own venue record keeps its
        # "volumes" entry and its original "years" value.
        data = data.copy()
        data["volumes_by_year"] = {
            year: sorted(
                filter(lambda k: volumes[k]["year"] == year, data["volumes"]),
                key=lambda x: SortedVolume(acronym, letter, x),
            )
            for year in sorted(data["years"])
        }
        data["years"] = sorted(data["years"])
        del data["volumes"]
        venues[acronym] = data

    # Prepare SIG index
    sigs = {}
    for acronym, sig in anthology.sigs.items():
        sigs[acronym] = {
            "name": sig.name,
            "slug": sig.slug,
            "url": sig.url,
            "volumes_by_year": sig.volumes_by_year,
            "years": sorted([str(year) for year in sig.years]),
        }

    # Dump all
    if dryrun:
        return

    # Create directories
    for subdir in ("", "papers", "people"):
        target_dir = "{}/{}".format(outdir, subdir)
        if not check_directory(target_dir, clean=clean):
            return

    # The 7 extra steps are volumes.yaml (counted as 5), venues.yaml and
    # sigs.yaml.
    progress = tqdm(total=len(papers) + len(people) + 7)
    try:
        for collection_id, paper_list in papers.items():
            with open("{}/papers/{}.yaml".format(outdir, collection_id),
                      "w", encoding="utf-8") as f:
                yaml.dump(paper_list, Dumper=Dumper, stream=f)
            progress.update()

        with open("{}/volumes.yaml".format(outdir), "w",
                  encoding="utf-8") as f:
            yaml.dump(volumes, Dumper=Dumper, stream=f)
        progress.update(5)

        with open("{}/venues.yaml".format(outdir), "w",
                  encoding="utf-8") as f:
            yaml.dump(venues, Dumper=Dumper, stream=f)
        progress.update()

        with open("{}/sigs.yaml".format(outdir), "w", encoding="utf-8") as f:
            yaml.dump(sigs, Dumper=Dumper, stream=f)
        progress.update()

        for first_letter, people_list in people.items():
            with open("{}/people/{}.yaml".format(outdir, first_letter),
                      "w", encoding="utf-8") as f:
                yaml.dump(people_list, Dumper=Dumper, stream=f)
            progress.update()
    finally:
        # Close the progress bar even if one of the dumps raises.
        progress.close()
示例#6
0
def export_anthology(anthology, outdir, clean=False, dryrun=False):
    """Build the paper, people, volume, venue and SIG indices and dump them as YAML.

    The indices are always computed; files are only written when ``dryrun``
    is false. Output layout below ``outdir``:

    * ``papers/<top_level_id>.yaml`` -- papers grouped by ``top_level_id``;
    * ``people/<c>.yaml`` -- people grouped by the first character of their ID;
    * ``volumes.yaml``, ``venues.yaml``, ``sigs.yaml``.

    Writing is abandoned silently if ``check_directory`` returns a false
    value for ``outdir`` or one of its ``papers``/``people`` subdirectories.

    Args:
        anthology: Object providing the ``papers``, ``people``, ``volumes``,
            ``venues`` and ``sigs`` collections read below.
        outdir: Directory that receives the YAML files.
        clean: Passed through to ``check_directory``.
        dryrun: If true, build all indices but do not touch the file system.
    """
    # Prepare paper index
    papers = defaultdict(dict)
    for id_, paper in anthology.papers.items():
        log.debug("export_anthology: processing paper '{}'".format(id_))
        data = paper.as_dict()
        data["title_html"] = paper.get_title("html")
        del data["xml_title"]
        if "xml_booktitle" in data:
            data["booktitle_html"] = paper.get_booktitle("html")
            del data["xml_booktitle"]
        if "xml_abstract" in data:
            data["abstract_html"] = paper.get_abstract("html")
            del data["xml_abstract"]
        # The comprehension variables are named distinctly from the loop
        # variable ``id_`` to keep the two from being confused.
        if "author" in data:
            data["author"] = [
                anthology.people.resolve_name(person, person_id)
                for person, person_id in data["author"]
            ]
        if "editor" in data:
            data["editor"] = [
                anthology.people.resolve_name(person, person_id)
                for person, person_id in data["editor"]
            ]
        papers[paper.top_level_id][paper.full_id] = data

    # Prepare people index
    people = defaultdict(dict)
    for id_ in anthology.people.personids():
        name = anthology.people.get_canonical_name(id_)
        log.debug("export_anthology: processing person '{}'".format(repr(name)))
        data = name.as_dict()
        data["slug"] = id_
        if id_ in anthology.people.comments:
            data["comment"] = anthology.people.comments[id_]
        if id_ in anthology.people.similar:
            data["similar"] = sorted(anthology.people.similar[id_])
        # Papers by year, most recent first.
        data["papers"] = sorted(
            anthology.people.get_papers(id_),
            key=lambda p: anthology.papers.get(p).get("year"),
            reverse=True,
        )
        # Co-authors and venues by count, highest first.
        data["coauthors"] = sorted(
            [[co_id, count] for (co_id, count) in anthology.people.get_coauthors(id_)],
            key=lambda p: p[1],
            reverse=True,
        )
        data["venues"] = sorted(
            [
                [venue, count]
                for (venue, count) in anthology.people.get_venues(
                    anthology.venues, id_
                ).items()
            ],
            key=lambda p: p[1],
            reverse=True,
        )
        variants = [n for n in anthology.people.get_used_names(id_) if n != name]
        if variants:
            data["variant_entries"] = [variant.as_dict() for variant in variants]
        people[id_[0]][id_] = data

    # Prepare volume index
    volumes = {}
    for id_, volume in anthology.volumes.items():
        log.debug("export_anthology: processing volume '{}'".format(id_))
        # Work on a copy: editing ``volume.attrib`` itself would strip
        # "xml_title" from the volume object and make a second export fail.
        data = dict(volume.attrib)
        data["title_html"] = volume.get_title("html")
        del data["xml_title"]
        if "xml_abstract" in data:
            del data["xml_abstract"]
        data["papers"] = volume.paper_ids
        if "author" in data:
            data["author"] = [
                anthology.people.resolve_name(person, person_id)
                for person, person_id in data["author"]
            ]
        if "editor" in data:
            data["editor"] = [
                anthology.people.resolve_name(person, person_id)
                for person, person_id in data["editor"]
            ]
        volumes[volume.full_id] = data

    # Prepare venue index
    venues = {}
    for acronym, data in anthology.venues.items():
        # Shallow copy so that the anthology's own venue record keeps its
        # "volumes" entry and its original "years" value.
        data = data.copy()
        data["volumes_by_year"] = {
            year: sorted(filter(lambda k: volumes[k]["year"] == year, data["volumes"]))
            for year in sorted(data["years"])
        }
        data["years"] = sorted(data["years"])
        del data["volumes"]
        venues[acronym] = data

    # Prepare SIG index
    sigs = {}
    for acronym, sig in anthology.sigs.items():
        sigs[acronym] = {
            "name": sig.name,
            "slug": sig.slug,
            "url": sig.url,
            "volumes_by_year": sig.volumes_by_year,
            "years": sorted([str(year) for year in sig.years]),
        }

    # Dump all
    if dryrun:
        return

    # Create directories
    for subdir in ("", "papers", "people"):
        target_dir = "{}/{}".format(outdir, subdir)
        if not check_directory(target_dir, clean=clean):
            return

    # The 7 extra steps are volumes.yaml (counted as 5), venues.yaml and
    # sigs.yaml.
    progress = tqdm(total=len(papers) + len(people) + 7)
    try:
        for top_level_id, paper_list in papers.items():
            with open(
                "{}/papers/{}.yaml".format(outdir, top_level_id), "w", encoding="utf-8"
            ) as f:
                yaml.dump(paper_list, Dumper=Dumper, stream=f)
            progress.update()

        with open("{}/volumes.yaml".format(outdir), "w", encoding="utf-8") as f:
            yaml.dump(volumes, Dumper=Dumper, stream=f)
        progress.update(5)

        with open("{}/venues.yaml".format(outdir), "w", encoding="utf-8") as f:
            yaml.dump(venues, Dumper=Dumper, stream=f)
        progress.update()

        with open("{}/sigs.yaml".format(outdir), "w", encoding="utf-8") as f:
            yaml.dump(sigs, Dumper=Dumper, stream=f)
        progress.update()

        for first_letter, people_list in people.items():
            with open(
                "{}/people/{}.yaml".format(outdir, first_letter), "w", encoding="utf-8"
            ) as f:
                yaml.dump(people_list, Dumper=Dumper, stream=f)
            progress.update()
    finally:
        # Close the progress bar even if one of the dumps raises.
        progress.close()