def handle_article(_: Any, article: ArticleType) -> bool:  # type: ignore
    """Process one parsed DBLP record and update the module-level tallies.

    The record is skipped (nothing is tallied) when it has no author, when
    none of its authors is known via ``facultydict`` / ``aliasdict`` /
    ``reversealiasdict`` (unless ``args.all`` is set), when its venue does
    not contain ``args.conference`` or is not in ``confdict``, or when
    ``countPaper`` rejects it.

    Updates the globals ``counter``, ``successes``, ``failures`` and
    ``totalPapers`` and the module-level ``authlogs``, ``interestingauthors``,
    ``authorscores``, ``authorscoresAdjusted`` and ``facultydict``.

    Args:
        _: Unused.
        article: Mapping from DBLP field name to value for one record.
            Author and title values may be plain strings or mappings that
            carry the text under the ``"#text"`` key.

    Returns:
        Always ``True``, whether the record was counted or skipped.

    Raises:
        Whatever the field extraction raises. Anything other than
        ``TypeError`` is printed and counted in ``failures`` before being
        re-raised.
    """
    global counter
    global successes
    global failures
    global totalPapers
    counter += 1
    try:
        if counter % 10000 == 0:
            print(f"{counter} papers processed.")
        if "author" not in article:
            return True
        # Fix if there is just one author: normalise to a list.
        rawAuthors = article["author"]
        authorList: List[str] = []
        if isinstance(rawAuthors, list):
            authorList = rawAuthors
        elif isinstance(rawAuthors, str):
            authorList = [str(rawAuthors)]
        elif isinstance(rawAuthors, dict):  # also covers OrderedDict
            authorList = [rawAuthors["#text"]]  # type: ignore
        else:
            print("***Unknown record type, skipping.***")
            return True
        authorsOnPaper = len(authorList)
        if not args.all:
            foundOneInDict = False
            for authorName in authorList:
                if isinstance(authorName, dict):
                    aName = authorName["#text"]  # type: ignore
                else:
                    aName = authorName
                aName = aName.strip()
                if aName in facultydict:
                    foundOneInDict = True
                    break
                # Each alias table is consulted independently. Wrapping both
                # lookups in one suppress(KeyError) meant a name missing from
                # aliasdict never reached the reversealiasdict check.
                if aName in aliasdict and aliasdict[aName] in facultydict:
                    foundOneInDict = True
                    break
                if (
                    aName in reversealiasdict
                    and reversealiasdict[aName] in facultydict
                ):
                    foundOneInDict = True
                    break
            if not foundOneInDict:
                return True
        if "booktitle" in article:
            confname = Conference(article["booktitle"])
        elif "journal" in article:
            confname = Conference(article["journal"])
        else:
            return True
        if args.conference not in confname:
            return True
        if confname not in confdict:
            return True
        volume = article.get("volume", "0")
        number = article.get("number", "0")
        url = article.get("url", "")
        year = int(article.get("year", "-1"))
        pages = ""
        areaname = confdict[confname]
        # Special handling for PACMPL: the "number" field names the venue.
        if areaname == Area("pacmpl"):
            confname = Conference(article["number"])
            if confname in confdict:
                areaname = confdict[confname]
            else:
                return True
        elif confname == Conference("ACM Trans. Graph."):
            # Specific (volume, number) issues are re-labelled as SIGGRAPH /
            # SIGGRAPH Asia.
            if year in TOG_SIGGRAPH_Volume:
                (vol, num) = TOG_SIGGRAPH_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    confname = Conference("SIGGRAPH")
                    areaname = confdict[confname]
            if year in TOG_SIGGRAPH_Asia_Volume:
                (vol, num) = TOG_SIGGRAPH_Asia_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    confname = Conference("SIGGRAPH Asia")
                    areaname = confdict[confname]
        elif confname == "IEEE Trans. Vis. Comput. Graph.":
            if year in TVCG_Vis_Volume:
                (vol, num) = TVCG_Vis_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    areaname = Area("vis")
            if year in TVCG_VR_Volume:
                (vol, num) = TVCG_VR_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    confname = Conference("VR")
                    areaname = Area("vr")
        # Bind the title unconditionally so a record without one no longer
        # leaves the name unbound for the countPaper() call below.
        title = Title("")
        if "title" in article:
            rawTitle = article["title"]
            if isinstance(rawTitle, dict):
                title = Title(rawTitle["#text"])  # type: ignore
            else:
                title = Title(rawTitle)
        if "pages" in article:
            pages = article["pages"]
            pageCount = pagecount(pages)
            startPage = startpage(pages)
        else:
            pageCount = -1
            startPage = -1
        successes += 1
    except TypeError:
        raise
    except BaseException:
        print(sys.exc_info()[0])
        failures += 1
        raise

    if countPaper(
        confname, year, volume, number, pages, startPage, pageCount, url, title
    ):
        totalPapers += 1
        for authorName in authorList:
            aName = ""
            if isinstance(authorName, dict):
                aName = authorName["#text"]  # type: ignore
            elif isinstance(authorName, str):
                aName = authorName
            realName = aliasdict.get(aName, aName)
            affiliation = ""
            if realName in facultydict:
                affiliation = facultydict[realName]
            elif realName in aliasdict:
                affiliation = facultydict[aliasdict[realName]]
            elif realName in reversealiasdict:
                affiliation = facultydict[reversealiasdict[realName]]
                # NOTE(review): assumed to belong to this branch only (cache
                # the affiliation under the name found via its reverse
                # alias) - confirm against the original indentation.
                facultydict[realName] = affiliation
            if (
                affiliation
                and (
                    realName in facultydict
                    or realName in aliasdict
                    or realName in reversealiasdict
                )
            ) or args.all:
                log: LogType = {
                    "name": realName.encode("utf-8"),
                    "year": year,
                    "title": title.encode("utf-8"),
                    "conf": confname,
                    "area": areaname,
                    "institution": affiliation,
                    "numauthors": authorsOnPaper,
                    "volume": volume,
                    "number": number,
                    "startPage": startPage,
                    "pageCount": pageCount,
                }
                tmplist: List[LogType] = authlogs.get(realName, [])
                tmplist.append(log)
                authlogs[realName] = tmplist
                interestingauthors[realName] += 1
                authorscores[(realName, areaname, year)] += 1.0
                authorscoresAdjusted[(realName, areaname, year)] += (
                    1.0 / authorsOnPaper
                )
    return True
def parseDBLP(facultydict):
    """Scan ``dblp.xml`` and credit papers to the authors in *facultydict*.

    A paper is credited when its venue is in ``confdict``, it passes the
    ISMB / ICSE special cases, its year lies in ``[startyear, endyear]``,
    it is not considered too short, and at least one author is in
    *facultydict*.

    Args:
        facultydict: Container of author names; only membership is tested.

    Returns:
        ``(interestingauthors, authorscores, authorscoresAdjusted, authlogs)``:
        papers per author, whole credit and 1/num-authors credit keyed by
        ``(author, area, year)``, and a list of log strings per author.
    """
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}
    record_tags = ("inproceedings", "article")
    icse_names = ("ICSE", "ICSE (1)", "ICSE (2)")
    zero_pages_ok = ("SC", "SIGSOFT FSE", "PLDI", "ACM Trans. Graph.")

    def tally(node):
        """Filter one fully parsed record and credit its faculty authors."""
        confname = ""
        year = -1
        pageCount = -1
        startPage = -1
        number = 0
        volume = 0
        foundArticle = False
        authors = []
        for child in node:
            tag = child.tag
            if tag == "booktitle" or tag == "journal":
                confname = child.text
                if confname in confdict:
                    foundArticle = True
            elif tag == "volume":
                volume = child.text
            elif tag == "number":
                number = child.text
            elif tag == "year":
                if child.text is not None:
                    year = int(child.text)
            elif tag == "pages":
                pageCount = pagecount(child.text)
                startPage = startpage(child.text)
            elif tag == "author" and child.text is not None:
                # Authors without text are ignored instead of crashing on
                # .strip() later.
                authors.append(child.text.strip())

        if not foundArticle:
            # Not one of our conferences.
            return
        areaname = confdict[confname]

        # Special handling for ISMB: only the listed (volume, number) issue
        # of Bioinformatics counts for a given year.
        if confname == "Bioinformatics":
            if year not in ISMB_Bioinformatics:
                return
            (vol, num) = ISMB_Bioinformatics[year]
            if (volume != str(vol)) or (number != str(num)):
                return

        # Special handling for ICSE: omit papers that start at or beyond the
        # listed page, since they are "short papers" regardless of length.
        if confname in icse_names and year in ICSE_ShortPaperStart:
            if startPage >= ICSE_ShortPaperStart[year]:
                return

        if year == -1:
            # No year.
            print("NO YEAR WAT", confname)
            return
        inRange = startyear <= year <= endyear

        # -1 means no pages were found at all, which is not treated as short.
        tooFewPages = (pageCount != -1) and (pageCount < pageCountThreshold)
        # Conferences with incorrect DBLP entries: a page count of 0 there
        # does not disqualify the paper.
        if pageCount == 0 and confname in zero_pages_ok:
            tooFewPages = False
        if confname == "ASE" and pageCount <= 6:
            tooFewPages = True

        foundOneInDict = any(name in facultydict for name in authors)
        if (not inRange) or (not foundOneInDict) or tooFewPages:
            return

        # If we got here, we have a winner.
        authorsOnPaper = len(authors)
        for authorName in authors:
            if authorName not in facultydict:
                continue
            # Plain str: concatenating encoded bytes with str raises
            # TypeError on Python 3.
            logstring = f"{authorName} ; {confname} {year}"
            authlogs.setdefault(authorName, []).append(logstring)
            interestingauthors[authorName] = (
                interestingauthors.get(authorName, 0) + 1
            )
            key = (authorName, areaname, year)
            authorscores[key] = authorscores.get(key, 0) + 1.0
            authorscoresAdjusted[key] = (
                authorscoresAdjusted.get(key, 0) + 1.0 / authorsOnPaper
            )

    # Binary mode lets the XML parser honour the file's declared encoding.
    with open("dblp.xml", mode="rb") as f:
        # with gzip.open('dblp.xml.gz') as f:
        open_records = 0
        for (event, node) in ElementTree.iterparse(f, events=("start", "end")):
            is_record = node.tag in record_tags
            if event == "start":
                if is_record:
                    open_records += 1
                continue
            # An element is only guaranteed to be fully populated at its
            # "end" event, so records are read here and never at "start".
            if is_record:
                open_records -= 1
                tally(node)
                node.clear()
            elif open_records == 0:
                # Outside any record of interest: release memory right away.
                # Children of an open record are kept until the record ends.
                node.clear()
    return (interestingauthors, authorscores, authorscoresAdjusted, authlogs)
def parseDBLP(facultydict):
    """Scan ``dblp.xml.gz`` and credit papers to the authors in *facultydict*.

    A paper is credited when its venue is in ``confdict``, at least one
    author is in *facultydict*, and ``countPaper`` accepts it.

    Args:
        facultydict: Mapping from author name to institution.

    Returns:
        ``(interestingauthors, authorscores, authorscoresAdjusted, authlogs)``:
        papers per author, whole credit and 1/num-authors credit keyed by
        ``(author, area, year)``, and a list of log dicts per author.
    """
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}
    record_tags = ("inproceedings", "article")

    def tally(node):
        """Filter one fully parsed record and credit its faculty authors."""
        confname = ""
        areaname = None
        title = ""
        # Reset per record: previously an absent <url> left this name
        # unbound, or carrying the previous record's value.
        url = ""
        year = -1
        pageCount = -1
        startPage = -1
        number = 0
        volume = 0
        foundArticle = False
        authorList = []
        for child in node:
            tag = child.tag
            text = child.text
            if tag == "booktitle" or tag == "journal":
                if text is not None:
                    confname = text
                    if confname in confdict:
                        areaname = confdict[confname]
                        foundArticle = True
            elif tag == "title":
                if text is not None:
                    title = text
            elif tag == "volume":
                volume = text
            elif tag == "number":
                number = text
            elif tag == "year":
                if text is not None:
                    year = int(text)
            elif tag == "pages":
                pageCount = pagecount(text)
                startPage = startpage(text)
            elif tag == "url":
                url = text
            elif tag == "author":
                if text is not None:
                    authorList.append(text.strip())

        # One of our conferences?
        if not foundArticle:
            return
        # Any authors in our affiliations?
        if not any(name in facultydict for name in authorList):
            return
        # One of the papers we count?
        if not countPaper(
            confname, year, volume, number, startPage, pageCount, url
        ):
            return

        # If we get here, we have a winner.
        authorsOnPaper = len(authorList)
        for authorName in authorList:
            if authorName not in facultydict:
                continue
            logstring = {
                'name': authorName.encode('utf-8'),
                'conf': confname,
                'area': areaname,
                'year': year,
                'title': title.encode('utf-8'),
                'institution': facultydict[authorName],
            }
            authlogs.setdefault(authorName, []).append(logstring)
            interestingauthors[authorName] = (
                interestingauthors.get(authorName, 0) + 1
            )
            key = (authorName, areaname, year)
            authorscores[key] = authorscores.get(key, 0) + 1.0
            authorscoresAdjusted[key] = (
                authorscoresAdjusted.get(key, 0) + 1.0 / authorsOnPaper
            )

    # with open('dblp.xml', mode='rb') as f:
    with gzip.open('dblp.xml.gz') as f:
        # The result is not used afterwards; the call is kept because it
        # presumably fails early when dblp.dtd is missing or malformed.
        dtd = ElementTree.DTD(file='dblp.dtd')
        open_records = 0
        for (event, node) in ElementTree.iterparse(
            f, events=['start', 'end'], load_dtd=True
        ):
            is_record = node.tag in record_tags
            if event == "start":
                if is_record:
                    open_records += 1
                continue
            # An element is only guaranteed to be fully populated at its
            # "end" event, so records are read here and never at "start".
            if is_record:
                open_records -= 1
                tally(node)
                node.clear()
            elif open_records == 0:
                # Outside any record of interest: release memory right away.
                # Children of an open record are kept until the record ends.
                node.clear()
    return (interestingauthors, authorscores, authorscoresAdjusted, authlogs)
def handle_article(_ : Any, article : ArticleType) -> bool: # type: ignore
    """Process one parsed DBLP record and update the module-level tallies.

    The record is skipped (nothing is tallied) when it has no author, when
    none of its authors is known via ``facultydict`` / ``aliasdict`` /
    ``reversealiasdict`` (unless ``args.all`` is set), when its venue does
    not contain ``args.conference`` or is not in ``confdict``, or when
    ``countPaper`` rejects it.

    Updates the globals ``counter``, ``successes``, ``failures`` and
    ``totalPapers`` and the module-level ``authlogs``, ``interestingauthors``,
    ``authorscores``, ``authorscoresAdjusted`` and ``facultydict``.

    Args:
        _: Unused.
        article: Mapping from DBLP field name to value for one record.
            Author and title values may be plain strings or mappings that
            carry the text under the ``"#text"`` key.

    Returns:
        Always ``True``, whether the record was counted or skipped.

    Raises:
        Whatever the field extraction raises. Anything other than
        ``TypeError`` is printed and counted in ``failures`` before being
        re-raised.
    """
    global counter
    global successes
    global failures
    global totalPapers
    counter += 1
    try:
        if counter % 10000 == 0:
            print(f"{counter} papers processed.")
        if 'author' not in article:
            return True
        # Fix if there is just one author: normalise to a list.
        rawAuthors = article['author']
        authorList : List[str] = []
        if isinstance(rawAuthors, list):
            authorList = rawAuthors
        elif isinstance(rawAuthors, str):
            authorList = [str(rawAuthors)]
        elif isinstance(rawAuthors, dict):
            # isinstance also accepts plain dicts, not only OrderedDict.
            authorList = [rawAuthors["#text"]] # type: ignore
        else:
            print("***Unknown record type, skipping.***")
            return True
        authorsOnPaper = len(authorList)
        if not args.all:
            foundOneInDict = False
            for authorName in authorList:
                if isinstance(authorName, dict):
                    aName = authorName["#text"] # type: ignore
                else:
                    aName = authorName
                aName = aName.strip()
                if aName in facultydict:
                    foundOneInDict = True
                    break
                # Explicit membership tests replace a bare "except: pass"
                # that hid every error and, when the name was missing from
                # aliasdict, skipped the reversealiasdict lookup entirely.
                if aName in aliasdict and aliasdict[aName] in facultydict:
                    foundOneInDict = True
                    break
                if (aName in reversealiasdict
                        and reversealiasdict[aName] in facultydict):
                    foundOneInDict = True
                    break
            if not foundOneInDict:
                return True
        if 'booktitle' in article:
            confname = Conference(article['booktitle'])
        elif 'journal' in article:
            confname = Conference(article['journal'])
        else:
            return True
        if args.conference not in confname:
            return True
        if confname not in confdict:
            return True
        volume = article.get('volume', "0")
        number = article.get('number', "0")
        url = article.get('url', "")
        year = int(article.get('year', "-1"))
        pages = ""
        areaname = confdict[confname]
        # Special handling for PACMPL: the 'number' field names the venue.
        if areaname == Area('pacmpl'):
            confname = Conference(article['number'])
            if confname in confdict:
                areaname = confdict[confname]
            else:
                return True
        elif confname == Conference('ACM Trans. Graph.'):
            # Specific (volume, number) issues are re-labelled as SIGGRAPH /
            # SIGGRAPH Asia.
            if year in TOG_SIGGRAPH_Volume:
                (vol, num) = TOG_SIGGRAPH_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    confname = Conference('SIGGRAPH')
                    areaname = confdict[confname]
            if year in TOG_SIGGRAPH_Asia_Volume:
                (vol, num) = TOG_SIGGRAPH_Asia_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    confname = Conference('SIGGRAPH Asia')
                    areaname = confdict[confname]
        elif confname == 'IEEE Trans. Vis. Comput. Graph.':
            if year in TVCG_Vis_Volume:
                (vol, num) = TVCG_Vis_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    areaname = Area('vis')
            if year in TVCG_VR_Volume:
                (vol, num) = TVCG_VR_Volume[year]
                if (volume == str(vol)) and (number == str(num)):
                    confname = Conference('VR')
                    areaname = Area('vr')
        # Bind the title unconditionally so a record without one no longer
        # leaves the name unbound for the countPaper() call below.
        title = Title("")
        if 'title' in article:
            rawTitle = article['title']
            if isinstance(rawTitle, dict):
                title = Title(rawTitle["#text"]) # type: ignore
            else:
                title = Title(rawTitle)
        if 'pages' in article:
            pages = article['pages']
            pageCount = pagecount(pages)
            startPage = startpage(pages)
        else:
            pageCount = -1
            startPage = -1
        successes += 1
    except TypeError:
        raise
    except BaseException:
        # Same reach as the former bare "except:", spelled out explicitly.
        print(sys.exc_info()[0])
        failures += 1
        raise

    if countPaper(confname, year, volume, number, pages, startPage,
                  pageCount, url, title):
        totalPapers += 1
        for authorName in authorList:
            aName = ""
            if isinstance(authorName, dict):
                aName = authorName["#text"] # type: ignore
            elif isinstance(authorName, str):
                aName = authorName
            realName = aliasdict.get(aName, aName)
            affiliation = ""
            if realName in facultydict:
                affiliation = facultydict[realName]
            elif realName in aliasdict:
                affiliation = facultydict[aliasdict[realName]]
            elif realName in reversealiasdict:
                affiliation = facultydict[reversealiasdict[realName]]
                # NOTE(review): assumed to belong to this branch only (cache
                # the affiliation under the name found via its reverse
                # alias) - confirm against the original indentation.
                facultydict[realName] = affiliation
            if (affiliation and (realName in facultydict
                                 or realName in aliasdict
                                 or realName in reversealiasdict)) or args.all:
                log : LogType = {
                    'name' : realName.encode('utf-8'),
                    'year' : year,
                    'title' : title.encode('utf-8'),
                    'conf' : confname,
                    'area' : areaname,
                    'institution' : affiliation,
                    'numauthors' : authorsOnPaper,
                    'volume' : volume,
                    'number' : number,
                    'startPage' : startPage,
                    'pageCount' : pageCount,
                }
                tmplist : List[LogType] = authlogs.get(realName, [])
                tmplist.append(log)
                authlogs[realName] = tmplist
                interestingauthors[realName] += 1
                authorscores[(realName, areaname, year)] += 1.0
                authorscoresAdjusted[(realName, areaname, year)] += 1.0 / authorsOnPaper
    return True
def parseDBLP(facultydict):
    """Scan ``dblp.xml`` and credit papers to the authors in *facultydict*.

    A paper is credited when at least one author is in *facultydict*, its
    venue is in ``confdict``, and ``countPaper`` accepts it.

    Args:
        facultydict: Container of author names; only membership is tested.

    Returns:
        ``(interestingauthors, authorscores, authorscoresAdjusted, authlogs)``:
        papers per author, whole credit and 1/num-authors credit keyed by
        ``(author, area, year)``, and a list of log strings per author.
    """
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}
    record_tags = ("inproceedings", "article")

    def tally(node):
        """Filter one fully parsed record and credit its faculty authors."""
        confname = ""
        areaname = None
        title = ""
        year = -1
        pageCount = -1
        startPage = -1
        number = 0
        volume = 0
        foundArticle = False
        authors = []
        for child in node:
            tag = child.tag
            if tag == "booktitle" or tag == "journal":
                confname = child.text
                if confname in confdict:
                    areaname = confdict[confname]
                    foundArticle = True
            elif tag == "title":
                if child.text is not None:
                    title = child.text
            elif tag == "volume":
                volume = child.text
            elif tag == "number":
                number = child.text
            elif tag == "year":
                if child.text is not None:
                    year = int(child.text)
            elif tag == "pages":
                pageCount = pagecount(child.text)
                startPage = startpage(child.text)
            elif tag == "author" and child.text is not None:
                # Authors without text are ignored instead of crashing on
                # .strip() later.
                authors.append(child.text.strip())

        # Any authors in our affiliations?
        if not any(name in facultydict for name in authors):
            return
        # One of our conferences?
        if not foundArticle:
            return
        # One of the papers we count?
        if not countPaper(confname, year, volume, number, startPage,
                          pageCount):
            return

        # If we get here, we have a winner.
        authorsOnPaper = len(authors)
        for authorName in authors:
            if authorName not in facultydict:
                continue
            # Plain str: concatenating encoded bytes with str raises
            # TypeError on Python 3.
            logstring = f"{authorName} ; {confname} {year}: {title}"
            authlogs.setdefault(authorName, []).append(logstring)
            interestingauthors[authorName] = (
                interestingauthors.get(authorName, 0) + 1
            )
            key = (authorName, areaname, year)
            authorscores[key] = authorscores.get(key, 0) + 1.0
            authorscoresAdjusted[key] = (
                authorscoresAdjusted.get(key, 0) + 1.0 / authorsOnPaper
            )

    # Binary mode lets the XML parser honour the file's declared encoding.
    with open('dblp.xml', mode='rb') as f:
        # with gzip.open('dblp.xml.gz') as f:
        open_records = 0
        for (event, node) in ElementTree.iterparse(f, events=("start", "end")):
            is_record = node.tag in record_tags
            if event == "start":
                if is_record:
                    open_records += 1
                continue
            # An element is only guaranteed to be fully populated at its
            # "end" event, so records are read here and never at "start".
            if is_record:
                open_records -= 1
                tally(node)
                node.clear()
            elif open_records == 0:
                # Outside any record of interest: release memory right away.
                # Children of an open record are kept until the record ends.
                node.clear()
    return (interestingauthors, authorscores, authorscoresAdjusted, authlogs)
def parseDBLP(facultydict):
    """Scan ``dblp.xml.gz`` and credit papers at any venue to known authors.

    Every venue is included; papers whose venue is not in ``confdict`` are
    filed under the area ``"na"``. A paper is credited when its year lies in
    ``[startyear, endyear]``, it is not considered too short, and at least
    one author is in *facultydict*.

    Args:
        facultydict: Container of author names; only membership is tested.

    Returns:
        ``(interestingauthors, authorscores, authorscoresAdjusted, authlogs)``:
        papers per author, whole credit and 1/num-authors credit keyed by
        ``(author, area, year)``, and a list of log strings per author.
    """
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}
    record_tags = ("inproceedings", "article")

    def tally(node):
        """Filter one fully parsed record and credit its faculty authors."""
        confname = ""
        year = -1
        pageCount = -1
        number = 0
        volume = 0
        authors = []
        for child in node:
            tag = child.tag
            if tag == "booktitle" or tag == "journal":
                confname = child.text
            elif tag == "volume":
                volume = child.text
            elif tag == "number":
                number = child.text
            elif tag == "year":
                if child.text is not None:
                    year = int(child.text)
            elif tag == "pages":
                pageCount = pagecount(child.text)
                # Kept for its call even though the start page is not used
                # by any filter in this variant.
                startpage(child.text)
            elif tag == "author" and child.text is not None:
                # Authors without text are ignored instead of crashing on
                # .strip() later.
                authors.append(child.text.strip())

        if confname is None:
            return
        # INCLUDE ALL VENUES: unknown venues get the placeholder area.
        areaname = confdict[confname] if confname in confdict else "na"

        if year == -1:
            # No year.
            return
        # Check that dates are in the specified range.
        inRange = startyear <= year <= endyear

        tooFewPages = (pageCount != -1) and (pageCount < pageCountThreshold)
        exceptionConference = (
            confname == 'SC'
            or (confname == 'SIGSOFT FSE' and year == 2012)
            or (confname == 'ACM Trans. Graph.' and 26 <= int(volume) <= 36)
        )
        if exceptionConference:
            tooFewPages = False

        foundOneInDict = any(name in facultydict for name in authors)
        if (not inRange) or (not foundOneInDict) or tooFewPages:
            return

        # If we got here, we have a winner.
        authorsOnPaper = len(authors)
        for authorName in authors:
            if authorName not in facultydict:
                continue
            print("here we go", authorName, confname, authorsOnPaper, year)
            # Plain str: concatenating encoded bytes with str raises
            # TypeError on Python 3.
            logstring = f"{authorName} ; {confname} {year}"
            authlogs.setdefault(authorName, []).append(logstring)
            interestingauthors[authorName] = (
                interestingauthors.get(authorName, 0) + 1
            )
            key = (authorName, areaname, year)
            authorscores[key] = authorscores.get(key, 0) + 1.0
            authorscoresAdjusted[key] = (
                authorscoresAdjusted.get(key, 0) + 1.0 / authorsOnPaper
            )

    # with open('dblp.xml', mode='r') as f:
    with gzip.open('dblp.xml.gz') as f:
        open_records = 0
        for (event, node) in ElementTree.iterparse(f, events=("start", "end")):
            is_record = node.tag in record_tags
            if event == "start":
                if is_record:
                    open_records += 1
                continue
            # An element is only guaranteed to be fully populated at its
            # "end" event, so records are read here and never at "start".
            if is_record:
                open_records -= 1
                tally(node)
                node.clear()
            elif open_records == 0:
                # Outside any record of interest: release memory right away.
                # Children of an open record are kept until the record ends.
                node.clear()
    return (interestingauthors, authorscores, authorscoresAdjusted, authlogs)
def parseDBLP():
    """Scan the gzipped DBLP dump and credit every author of counted papers.

    A paper is counted when its venue is in ``confdict`` and ``countPaper``
    accepts it. All of its authors are then credited, and a metadata dict
    for the paper is recorded.

    Returns:
        ``(allpapers, interestingauthors, authorscores,
        authorscoresAdjusted, authlogs)``: one info dict per counted paper
        (keys such as ``dblp``, ``authors``, ``title``, ``year``, ``venue``,
        ``area``, ``url``), papers per author, whole credit and
        1/num-authors credit keyed by ``(author, area, year)``, and a list
        of log strings per author.

    Raises:
        Exception: "No title" when a counted paper ends up without a title.
    """
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}
    allpapers = []
    record_tags = ("inproceedings", "article")

    def tally(node):
        """Filter one fully parsed record, credit its authors, and log it."""
        confname = ""
        areaname = None
        year = -1
        pageCount = -1
        startPage = -1
        number = 0
        volume = 0
        foundArticle = False
        paperinfo = {"authors": [], "dblp": node.get("key")}
        for child in node:
            tag = child.tag
            if tag == "booktitle" or tag == "journal":
                confname = child.text
                if confname in confdict:
                    areaname = confdict[confname]
                    paperinfo['area'] = areaname
                    foundArticle = True
                if confname in conf2confdict:
                    paperinfo['venue'] = conf2confdict[confname]
                else:
                    paperinfo['venue'] = confname
            elif tag == "volume":
                volume = child.text
            elif tag == "number":
                number = child.text
            elif tag == "title":
                # Serialising as text instead of reading child.text keeps
                # titles that contain nested markup and strips the tags.
                # encoding="unicode" yields a str directly, so there is no
                # byte round-trip (the former utf-8 encode followed by a
                # latin1 decode garbled non-ASCII titles).
                paperinfo['title'] = ElementTree.tostring(
                    child, method="text", encoding="unicode"
                ).strip(" \n\t.")
            elif tag == "year":
                if child.text is not None:
                    year = int(child.text)
                    paperinfo['year'] = year
            elif tag == "pages":
                pageCount = pagecount(child.text)
                startPage = startpage(child.text)
            elif tag == "url":
                # This was sometimes None even though the XML clearly had a
                # URL - presumably because records used to be read at their
                # "start" event, before all children had been parsed.
                paperinfo["url"] = child.text
            elif tag == "author" and child.text is not None:
                # Authors without text are ignored instead of crashing on
                # .strip() later.
                paperinfo['authors'].append(child.text.strip())

        # One of our conferences?
        if not foundArticle:
            return
        # One of the papers we count?
        if not countPaper(confname, year, volume, number, startPage,
                          pageCount):
            return
        # Sanity check for errors where no title shows up (detects any
        # recurrence of a bug where titles containing XML were dropped).
        if not paperinfo.get('title', False):
            ElementTree.dump(node)
            print(paperinfo)
            raise Exception("No title")

        # If we get here, we have a winner: every author is credited.
        authorsOnPaper = len(paperinfo['authors'])
        for authorName in paperinfo['authors']:
            # Plain str: concatenating encoded bytes with str raises
            # TypeError on Python 3.
            logstring = f"{authorName} ; {confname} {year}"
            authlogs.setdefault(authorName, []).append(logstring)
            interestingauthors[authorName] = (
                interestingauthors.get(authorName, 0) + 1
            )
            key = (authorName, areaname, year)
            authorscores[key] = authorscores.get(key, 0) + 1.0
            authorscoresAdjusted[key] = (
                authorscoresAdjusted.get(key, 0) + 1.0 / authorsOnPaper
            )
        # Record all paper info for logging.
        allpapers.append(paperinfo)

    with gzip.open('generated/dblp/dblp.xml.gz', mode='r') as f:
        # with open('generated/foo/foo.xml', mode='r') as f:
        open_records = 0
        for (event, node) in ElementTree.iterparse(f, events=("start", "end")):
            is_record = node.tag in record_tags
            if event == "start":
                if is_record:
                    open_records += 1
                continue
            # An element is only guaranteed to be fully populated at its
            # "end" event, so records are read here and never at "start".
            if is_record:
                open_records -= 1
                tally(node)
                node.clear()
            elif open_records == 0:
                # Outside any record of interest: release memory right away.
                # Children of an open record are kept until the record ends.
                node.clear()
    return (allpapers, interestingauthors, authorscores, authorscoresAdjusted,
            authlogs)