def update_from_dblp(commit=False):
    """
    Fetch the DBLP record of every group member and register their publications.

    Publications whose visibility is still undecided (`None`) and which satisfy the
    member's predicate are made visible. Publications without a database id are added
    to the session.

    .. note :: Existing entries are not updated. This is deliberate.

    :param commit: if `True` commit result to disk.
    :returns: a tuple of the publications that were made visible by this call

    """
    newly_visible = []

    for member_pid, wanted in dblp_pids():
        logging.info("Fetching user '{group_member}'".format(group_member=member_pid))

        for entry in dblp_parse(dblp_fetch(member_pid)):
            # Authors may have entered the DB since this entry was parsed; look each
            # one up again so that we do not create duplicates.
            entry.authors = [
                Author.from_dblp_pid(session, pid=person.dblp_pid, name=person.name)
                for person in entry.authors
            ]

            undecided = entry.visibility is None
            if undecided and wanted(entry):
                logging.info("Added '{publication}'".format(publication=entry))
                entry.visibility = True
                newly_visible.append(entry)

            # no id yet means the entry has not been stored so far
            not_stored_yet = entry.id is None
            if not_stored_yet:
                session.add(entry)

    if commit:
        session.commit()

    return tuple(newly_visible)
# Disambiguated DBLP authors carry a numeric suffix ("Foo Bar 0001"); group 1 is the bare name.
_DBLP_AUTHOR_NAME = re.compile(r"([^0-9]*)([0-9]+)?")

# Group 3 is the report number, e.g. "123" in "https://eprint.iacr.org/2020/123".
_EPRINT_URL = re.compile(r"http(s)?://eprint\.iacr\.org/([0-9]{4})/([0-9]+)")

# Child element that holds the venue, for each publication type.
_VENUE_TAGS = {
    "article": "journal",
    "informal": "journal",
    "inproceedings": "booktitle",
    "incollection": "booktitle",
    "phdthesis": "school",
    "book": "publisher",
    "proceedings": "publisher",
}


def dblp_parse(root):
    """
    Parse DBLP XML

    Every `<r>` child of `root` is expected to wrap exactly one publication record,
    which is turned into a `Publication` via `Publication.from_dblp_key`.

    :param root: `xml.etree.ElementTree` output of `dblp_fetch`
    :returns: a list of `Publication`s
    :raises ValueError: if a record has an unknown type, a type without a known venue
        field, or no title

    """
    publications = []

    for child in root:
        if child.tag != "r":
            continue

        publication = child[0]
        dblp_key = publication.attrib["key"]
        mdate = datetime.date.fromisoformat(publication.attrib["mdate"])

        if publication.tag == "article":
            # articles flagged with publtype="informal" are tracked as their own type
            if publication.attrib.get("publtype") == "informal":
                publication_type = "informal"
            else:
                publication_type = "article"
        elif publication.tag in PUBLICATION_TYPES:
            publication_type = publication.tag
        else:
            raise ValueError(
                "Type of publication for '%s' not understood"
                % ET.tostring(publication, encoding="unicode")
            )

        # proceedings list editors rather than authors
        author_tag = "editor" if publication_type == "proceedings" else "author"

        authors = []
        for author in publication.findall(author_tag):
            # Foo Bar 0001 is a thing on DBLP
            author_name = _DBLP_AUTHOR_NAME.match(author.text).group(1).strip()
            authors.append(Author.from_dblp_pid(session, author.attrib["pid"], author_name))

        # many-to-many relations don't preserve order but author order can matter so we store it manually
        author_order = Publication.author_orderf(authors)

        title_element = publication.find("title")
        if title_element is None:
            raise ValueError(
                "No title found for '%s'" % ET.tostring(publication, encoding="unicode")
            )
        # Titles may contain nested markup (<i>, <sub>, <sup>, ...); `findtext` would
        # stop at the first nested element, so collect the text of the whole subtree.
        title = "".join(title_element.itertext())
        if title.endswith("."):
            title = title[:-1]

        year = int(publication.findtext("year"))
        url = publication.findtext("ee")
        dblp_url = publication.findtext("url")
        pages = publication.findtext("pages", "")

        venue_tag = _VENUE_TAGS.get(publication_type)
        if venue_tag is None:
            raise ValueError(
                "Type of publication for '%s' not understood when parsing for venue"
                % ET.tostring(publication, encoding="unicode")
            )
        venue = publication.findtext(venue_tag)

        volume = publication.findtext("volume", "")
        number = publication.findtext("number", "")

        # IACR ePrint is so important to us we treat it specially
        if venue == "IACR Cryptol. ePrint Arch.":
            eprint = _EPRINT_URL.match(url) if url else None
            if eprint is not None:
                number = eprint.group(3)
            else:
                # the record has no (or a non-ePrint) first link: keep what DBLP gave us
                logging.warning(
                    "No ePrint URL found for '%s', keeping number '%s'", dblp_key, number
                )

        publications.append(
            # get it from DB if it exists, otherwise create new entry
            Publication.from_dblp_key(
                session,
                key=dblp_key,
                type=publication_type,
                authors=authors,
                author_order=author_order,
                title=title,
                pages=pages,
                venue=venue,
                volume=volume,
                number=number,
                year=year,
                url=url,
                dblp_url=dblp_url,
                dblp_mdate=mdate,
                visibility=None,
            )
        )
        logging.debug("Found '%s'", publications[-1])

    return publications