示例#1
0
def update_from_dblp(commit=False):
    """
    Fetch the DBLP records of every tracked person and stage them in the session.

    For each ``(pid, predicate)`` pair yielded by `dblp_pids`, the DBLP data is
    fetched with `dblp_fetch` and parsed with `dblp_parse`. A parsed publication
    whose ``visibility`` is still ``None`` and which satisfies the predicate is
    made visible and reported as new. Any publication without an ``id`` is
    added to the session.

    .. note :: Existing entries are not updated. This is deliberate.

    :param commit: if `True`, call ``session.commit()`` once all people were processed.
    :returns: a tuple of the publications that were newly made visible.

    """
    added = []

    for member, accept in dblp_pids():
        logging.info("Fetching user '{group_member}'".format(group_member=member))

        for entry in dblp_parse(dblp_fetch(member)):
            # Authors may have been inserted into the DB since parsing; looking them
            # up again avoids creating duplicates.
            refreshed_authors = []
            for person in entry.authors:
                refreshed_authors.append(
                    Author.from_dblp_pid(session, pid=person.dblp_pid, name=person.name)
                )
            entry.authors = refreshed_authors

            # The predicate is only consulted for entries with undecided visibility.
            if entry.visibility is None:
                if accept(entry):
                    logging.info("Added '{publication}'".format(publication=entry))
                    entry.visibility = True
                    added.append(entry)

            if entry.id is None:
                session.add(entry)

    if commit:
        session.commit()

    return tuple(added)
示例#2
0
def dblp_parse(root):
    """
    Parse DBLP XML

    Every ``<r>`` child of ``root`` is read as a wrapper around one publication
    record (its first child element); all other children are skipped. Each
    record is turned into a `Publication` via `Publication.from_dblp_key`, with
    ``visibility`` passed as ``None``.

    :param root: `xml.etree.ElementTree` output of `dblp_fetch`
    :returns: a list of `Publication`s
    :raises ValueError: if a record's type is not understood, if no venue field
        is known for its type, or if it has no ``<title>`` element

    """
    # Foo Bar 0001 is a thing on DBLP: group 1 is the name without the numeric suffix
    author_name_re = re.compile(r"([^0-9]*)([0-9]+)?")
    # group 1 is the ePrint year, group 2 the report number
    eprint_url_re = re.compile(r"https?://eprint\.iacr\.org/([0-9]{4})/([0-9]+)")
    # which XML child element holds the venue, per publication type
    venue_fields = {
        "article": "journal",
        "informal": "journal",
        "inproceedings": "booktitle",
        "incollection": "booktitle",
        "phdthesis": "school",
        "book": "publisher",
        "proceedings": "publisher",
    }

    publications = []

    for child in root:
        if child.tag != "r":
            continue

        publication = child[0]

        dblp_key = publication.attrib["key"]
        mdate = datetime.date.fromisoformat(publication.attrib["mdate"])

        if publication.tag == "article":
            # articles flagged publtype="informal" get their own type
            if publication.attrib.get("publtype") == "informal":
                publication_type = "informal"
            else:
                publication_type = "article"
        elif publication.tag in PUBLICATION_TYPES:
            publication_type = publication.tag
        else:
            raise ValueError(
                "Type of publication for '%s' not understood" % ET.tostring(publication)
            )

        # proceedings list editors instead of authors
        author_tag = "editor" if publication_type in ("proceedings",) else "author"

        authors = []
        for author in publication.findall(author_tag):
            author_name = author_name_re.match(author.text).group(1).strip()
            authors.append(Author.from_dblp_pid(session, author.attrib["pid"], author_name))

        # many-to-many relations don't preserve order but author order can matter so we store it manually
        author_order = Publication.author_orderf(authors)

        title_element = publication.find("title")
        if title_element is None:
            raise ValueError("No title found for '%s'" % ET.tostring(publication))
        # itertext() also collects text inside and after inline markup such as <i>/<sub>/<sup>;
        # findtext() would stop at the first nested element and truncate the title
        title = "".join(title_element.itertext())
        if title.endswith("."):
            title = title[:-1]

        year = int(publication.findtext("year"))
        url = publication.findtext("ee")
        dblp_url = publication.findtext("url")
        pages = publication.findtext("pages", "")

        venue_field = venue_fields.get(publication_type)
        if venue_field is None:
            raise ValueError(
                "Type of publication for '%s' not understood when parsing for venue"
                % ET.tostring(publication)
            )
        venue = publication.findtext(venue_field)

        volume = publication.findtext("volume", "")
        number = publication.findtext("number", "")
        # IACR ePrint is so important to us we treat it specially
        if venue == "IACR Cryptol. ePrint Arch.":
            # take the report number from the first <ee> link pointing at eprint.iacr.org;
            # a record may carry several <ee> links, or none at all
            for ee in publication.findall("ee"):
                eprint_match = eprint_url_re.match(ee.text or "")
                if eprint_match:
                    number = eprint_match.group(2)
                    break
            else:
                logging.warning(
                    "No ePrint URL found for '%s', keeping number '%s'", dblp_key, number
                )

        publications.append(
            # get it from DB if it exists, otherwise create new entry
            Publication.from_dblp_key(
                session,
                key=dblp_key,
                type=publication_type,
                authors=authors,
                author_order=author_order,
                title=title,
                pages=pages,
                venue=venue,
                volume=volume,
                number=number,
                year=year,
                url=url,
                dblp_url=dblp_url,
                dblp_mdate=mdate,
                visibility=None,
            )
        )
        logging.debug("Found '{publication}'".format(publication=publications[-1]))

    return publications