def lookupDoi(metaInfoDict, repeatCount=2, delaySecs=5):
    """ Take author, vol, journal etc. from metaInfoDict, send them as a
    free-form citation query to the Crossref REST API
    (https://api.crossref.org/works) and return the DOI of the first hit.

    metaInfoDict must contain the keys "authors", "title", "journal", "year",
    "vol", "issue", "page" and "printIssn"; a missing key raises KeyError.
    The values are concatenated, so they have to be strings.
    repeatCount and delaySecs are passed through to
    maxCommon.retryHttpRequest.

    Returns the DOI, or None if the request failed, the reply could not be
    read or parsed, or the reply contains no usable result.

    >>> lookupDoi({"authors":"M. Henrion, D. J. Mortlock, D. J. Hand, and A. Gandy", "title":"A Bayesian approach to star-galaxy classification", "journal":"Monthly Notices of the Royal Astronomical Society", "vol":"414", "issue":"4", "page":"2286", "year":"2011", "printIssn" : ""})
    u'10.1111/j.1365-2966.2010.18055.x'
    """
    # construct url
    mid = metaInfoDict
    logging.debug("Looking up DOI for article %s, %s with crossref links api",
                  mid["authors"], mid["title"])
    freeFormCitFields = [
        mid["authors"],
        '"%s"' % mid["title"],
        mid["journal"],
        mid["year"],
        "vol. " + mid["vol"],
        "no. " + mid["issue"],
        "pp. " + mid["page"],
        mid["printIssn"],
    ]
    freeFormCitStr = ", ".join(freeFormCitFields)
    logging.debug("crossref.org query %s", freeFormCitStr)
    url = "https://api.crossref.org/works"
    geturl = url + "?query=" + urllib2.quote(freeFormCitStr.encode('utf-8'))

    # send request
    httpResp = maxCommon.retryHttpRequest(geturl, None, delaySecs=delaySecs,
                                          repeatCount=repeatCount)
    if httpResp is None:
        logging.debug("HTTPError while sending crossref request")
        return None

    # read the reply; always close the response, even if read() fails
    try:
        try:
            jsonStr = httpResp.read()
        finally:
            httpResp.close()
    except Exception:
        # "Exception" rather than a bare except, so that KeyboardInterrupt
        # and SystemExit are not swallowed
        logging.debug("sslError while reading httpResp")
        return None

    # parse result
    try:
        xrdata = json.loads(jsonStr)
    except ValueError:
        logging.debug("Could not parse crossref reply as JSON")
        return None

    if not xrdata:
        logging.debug("Empty cross reply")
        return None

    try:
        items = xrdata["message"]["items"]
    except (KeyError, TypeError):
        logging.debug("Unexpected JSON content from crossref")
        return None

    if not items:
        logging.debug("no results in crossref reply")
        return None

    # the first item in the reply is taken as the best match
    firstRes = items[0]
    logging.debug("Best match from Crossref: %s", firstRes)
    try:
        doi = firstRes["DOI"]
    except (KeyError, TypeError):
        logging.debug("No DOI in best crossref match")
        return None
    logging.debug("Got DOI: %s", doi)
    return doi
def getOutlinks(pmid, preferPmc=False):
    """ Use NCBI eutils (elink, cmd=llinks) to get the full-text outlinks for
    a pmid as an OrderedDict provider -> url.

    A link is only kept if its <ObjUrl> record carries a full-text attribute
    ("full-text online" or "full-text PDF", compared case-insensitively) and
    a line containing "publishers/providers". The key is the text of the
    record's <NameAbbr> line.

    If preferPmc is True and a link to www.ncbi.nlm.nih.gov/.../pmc/ is
    found, all links collected up to that point are dropped and the PMC url
    is stored under the key "pmc". Links that appear later in the reply are
    still added.

    Returns None if the HTTP request failed.
    """
    logging.debug("%s: Getting outlink from pubmed", pmid)
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=%s&retmode=llinks&cmd=llinks" % pmid
    userAgent = 'User-Agent: Mozilla ([email protected], http://text.soe.ucsc.edu)'

    html = maxCommon.retryHttpRequest(url, userAgent=userAgent)
    if html is None:
        return None

    outlinks = OrderedDict()
    provider = False  # True while inside a <Provider> element
    fullText = False
    # initialised here so that a <NameAbbr> line seen before the first
    # <ObjUrl> cannot trigger an UnboundLocalError
    origPublisher = False

    try:
        # the reply is scanned line by line; the checks are deliberately not
        # "elif", as they are independent of each other
        for line in html:
            if "<ObjUrl>" in line:
                # start of a new link record: reset the per-record state
                url = ""
                fullText = False
                origPublisher = False

            if "Attribute" in line:
                # lower-case before comparing: LinkOut spells the attribute
                # "full-text PDF"
                attribute = stripTag(line).lower()
                if attribute in ("full-text online", "full-text pdf"):
                    fullText = True

            if "<NameAbbr>" in line and fullText and origPublisher:
                db = stripTag(line)
                outlinks[db] = url

            if "publishers/providers" in line:
                origPublisher = True

            if "<Provider>" in line:
                provider = True
            if "</Provider>" in line:
                provider = False

            if "<Url>" in line and not provider:
                # the url is XML-escaped in the reply; "&amp;" is decoded
                # last so that an escaped ampersand cannot create a new entity
                url = stripTag(line)
                url = url.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
                if preferPmc and "www.ncbi.nlm.nih.gov" in url and "/pmc/" in url:
                    # override all other links found so far
                    outlinks.clear()
                    outlinks["pmc"] = url
    finally:
        html.close()

    logging.debug("%s: Found outlinks %s", pmid, str(outlinks))
    return outlinks
def getOutlinks(pmid, preferPmc=False):
    """ Use NCBI eutils (elink, cmd=llinks) to get the full-text outlinks for
    a pmid as an OrderedDict provider -> url.

    A link is only kept if its <ObjUrl> record carries a full-text attribute
    ("full-text online" or "full-text PDF", compared case-insensitively) and
    a line containing "publishers/providers". The key is the text of the
    record's <NameAbbr> line.

    If preferPmc is True and a link to www.ncbi.nlm.nih.gov/.../pmc/ is
    found, all links collected up to that point are dropped and the PMC url
    is stored under the key "pmc". Links that appear later in the reply are
    still added.

    Returns None if the HTTP request failed.
    """
    logging.debug("%s: Getting outlink from pubmed", pmid)
    # https, not http: NCBI serves the E-utilities over HTTPS only
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=%s&retmode=llinks&cmd=llinks" % pmid
    userAgent = 'User-Agent: Mozilla ([email protected], http://text.soe.ucsc.edu)'

    html = maxCommon.retryHttpRequest(url, userAgent=userAgent)
    if html is None:
        return None

    outlinks = OrderedDict()
    provider = False  # True while inside a <Provider> element
    fullText = False
    # initialised here so that a <NameAbbr> line seen before the first
    # <ObjUrl> cannot trigger an UnboundLocalError
    origPublisher = False

    try:
        # the reply is scanned line by line; the checks are deliberately not
        # "elif", as they are independent of each other
        for line in html:
            if "<ObjUrl>" in line:
                # start of a new link record: reset the per-record state
                url = ""
                fullText = False
                origPublisher = False

            if "Attribute" in line:
                # lower-case before comparing: LinkOut spells the attribute
                # "full-text PDF"
                attribute = stripTag(line).lower()
                if attribute in ("full-text online", "full-text pdf"):
                    fullText = True

            if "<NameAbbr>" in line and fullText and origPublisher:
                db = stripTag(line)
                outlinks[db] = url

            if "publishers/providers" in line:
                origPublisher = True

            if "<Provider>" in line:
                provider = True
            if "</Provider>" in line:
                provider = False

            if "<Url>" in line and not provider:
                # the url is XML-escaped in the reply; "&amp;" is decoded
                # last so that an escaped ampersand cannot create a new entity
                url = stripTag(line)
                url = url.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
                if preferPmc and "www.ncbi.nlm.nih.gov" in url and "/pmc/" in url:
                    # override all other links found so far
                    outlinks.clear()
                    outlinks["pmc"] = url
    finally:
        html.close()

    logging.debug("%s: Found outlinks %s", pmid, str(outlinks))
    return outlinks
def lookupDoi(metaInfoDict, repeatCount=2, delaySecs=5):
    """ Take author, vol, journal etc. from metaInfoDict, send them as a
    free-form citation query to the Crossref REST API
    (https://api.crossref.org/works) and return the DOI of the first hit.

    metaInfoDict must contain the keys "authors", "title", "journal", "year",
    "vol", "issue", "page" and "printIssn"; a missing key raises KeyError.
    The values are concatenated, so they have to be strings.
    repeatCount and delaySecs are passed through to
    maxCommon.retryHttpRequest.

    Returns the DOI, or None if the request failed, the reply could not be
    read or parsed, or the reply contains no usable result.

    >>> lookupDoi({"authors":"M. Henrion, D. J. Mortlock, D. J. Hand, and A. Gandy", "title":"A Bayesian approach to star-galaxy classification", "journal":"Monthly Notices of the Royal Astronomical Society", "vol":"414", "issue":"4", "page":"2286", "year":"2011", "printIssn" : ""})
    u'10.1111/j.1365-2966.2010.18055.x'
    """
    # construct url
    mid = metaInfoDict
    logging.debug("Looking up DOI for article %s, %s with crossref links api",
                  mid["authors"], mid["title"])
    freeFormCitFields = [
        mid["authors"],
        '"%s"' % mid["title"],
        mid["journal"],
        mid["year"],
        "vol. " + mid["vol"],
        "no. " + mid["issue"],
        "pp. " + mid["page"],
        mid["printIssn"],
    ]
    freeFormCitStr = ", ".join(freeFormCitFields)
    logging.debug("crossref.org query %s", freeFormCitStr)
    url = "https://api.crossref.org/works"
    geturl = url + "?query=" + urllib2.quote(freeFormCitStr.encode('utf-8'))

    # send request
    httpResp = maxCommon.retryHttpRequest(geturl, None, delaySecs=delaySecs,
                                          repeatCount=repeatCount)
    if httpResp is None:
        logging.debug("HTTPError while sending crossref request")
        return None

    # read the reply; always close the response, even if read() fails
    try:
        try:
            jsonStr = httpResp.read()
        finally:
            httpResp.close()
    except Exception:
        # "Exception" rather than a bare except, so that KeyboardInterrupt
        # and SystemExit are not swallowed
        logging.debug("sslError while reading httpResp")
        return None

    # parse result
    try:
        xrdata = json.loads(jsonStr)
    except ValueError:
        logging.debug("Could not parse crossref reply as JSON")
        return None

    if not xrdata:
        logging.debug("Empty cross reply")
        return None

    try:
        items = xrdata["message"]["items"]
    except (KeyError, TypeError):
        logging.debug("Unexpected JSON content from crossref")
        return None

    if not items:
        logging.debug("no results in crossref reply")
        return None

    # the first item in the reply is taken as the best match
    firstRes = items[0]
    logging.debug("Best match from Crossref: %s", firstRes)
    try:
        doi = firstRes["DOI"]
    except (KeyError, TypeError):
        logging.debug("No DOI in best crossref match")
        return None
    logging.debug("Got DOI: %s", doi)
    return doi
def lookupDoi(metaInfoDict, repeatCount=2, delaySecs=5):
    """ Take author, vol, journal etc. from metaInfoDict, query crossref
    'links' (http://search.crossref.org/links) and return the DOI if found.

    metaInfoDict must contain the keys "authors", "title", "journal", "year",
    "vol", "issue", "page" and "printIssn"; a missing key raises KeyError.
    The values are concatenated, so they have to be strings.
    The free-form citation is sent as a JSON list with one element as the
    request data. repeatCount and delaySecs are passed through to
    maxCommon.retryHttpRequest.

    Returns the DOI without the "http://dx.doi.org/" prefix, or None if the
    request failed, the reply could not be read or parsed, or the reply
    contains no match.

    >>> lookupDoi({"authors":"M. Henrion, D. J. Mortlock, D. J. Hand, and A. Gandy", "title":"A Bayesian approach to star-galaxy classification", "journal":"Monthly Notices of the Royal Astronomical Society", "vol":"414", "issue":"4", "page":"2286", "year":"2011", "printIssn" : ""})
    u'10.1111/j.1365-2966.2010.18055.x'
    """
    # construct url
    mid = metaInfoDict
    logging.debug("Looking up DOI for article %s, %s with crossref links api",
                  mid["authors"], mid["title"])
    freeFormCitFields = [
        mid["authors"],
        '"%s"' % mid["title"],
        mid["journal"],
        mid["year"],
        "vol. " + mid["vol"],
        "no. " + mid["issue"],
        "pp. " + mid["page"],
        mid["printIssn"],
    ]
    freeFormCitStr = ", ".join(freeFormCitFields)
    url = "http://search.crossref.org/links?"
    jsonParam = json.dumps([freeFormCitStr])
    logging.debug("JSON string %s", jsonParam)

    # send request
    httpResp = maxCommon.retryHttpRequest(url, jsonParam, delaySecs=delaySecs,
                                          repeatCount=repeatCount)
    if httpResp is None:
        logging.debug("HTTPError while sending crossref request")
        return None

    # read the reply; always close the response, even if read() fails
    try:
        try:
            jsonStr = httpResp.read()
        finally:
            httpResp.close()
    except Exception:
        logging.debug("Error while reading crossref reply")
        return None

    # parse result
    try:
        xrdata = json.loads(jsonStr)
    except ValueError:
        logging.debug("Could not parse crossref reply as JSON")
        return None

    if not xrdata:
        logging.debug("Empty cross reply")
        return None

    # a reply that is not a dict or lacks "query_ok" is treated as an error
    if not isinstance(xrdata, dict) or not xrdata.get("query_ok"):
        logging.debug("Query error from crossref")
        return None

    results = xrdata.get("results")
    if not results:
        logging.debug("no results in crossref reply")
        return None

    firstRes = results[0]
    if not firstRes.get("match"):
        logging.debug("no match in crossref resply")
        return None

    logging.debug("Best match from Crossref: %s", firstRes)
    doi = firstRes.get("doi")
    if doi is None:
        logging.debug("No DOI in best crossref match")
        return None
    # crossref now always adds the url, strip it
    doi = doi.replace("http://dx.doi.org/", "")
    logging.debug("Got DOI: %s", doi)
    return doi
def getOutlinks(pmid):
    """ Use NCBI eutils (elink, cmd=llinks) to get the full-text outlinks for
    a pmid as a dict provider -> url.

    A link is only kept if its <ObjUrl> record carries a full-text attribute
    ("full-text online" or "full-text PDF", compared case-insensitively) and
    a line containing "publishers/providers". The key is the text of the
    record's <NameAbbr> line.

    Returns None if the HTTP request failed.
    """
    logging.debug("%s: Getting outlink from pubmed", pmid)
    # https, not http: NCBI serves the E-utilities over HTTPS only
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=%s&retmode=llinks&cmd=llinks" % pmid
    req = urllib2.Request(url)
    req.add_header("User-Agent", "User-Agent: Mozilla ([email protected], http://text.soe.ucsc.edu)")

    html = maxCommon.retryHttpRequest(req)
    if html is None:
        return None

    outlinks = {}
    provider = False  # True while inside a <Provider> element
    fullText = False
    # initialised here so that a <NameAbbr> line seen before the first
    # <ObjUrl> cannot trigger an UnboundLocalError
    origPublisher = False

    try:
        # the reply is scanned line by line; the checks are deliberately not
        # "elif", as they are independent of each other
        for line in html:
            if "<ObjUrl>" in line:
                # start of a new link record: reset the per-record state
                url = ""
                fullText = False
                origPublisher = False

            if "Attribute" in line:
                # lower-case before comparing: LinkOut spells the attribute
                # "full-text PDF"
                attribute = stripTag(line).lower()
                if attribute in ("full-text online", "full-text pdf"):
                    fullText = True

            if "<NameAbbr>" in line and fullText and origPublisher:
                db = stripTag(line)
                outlinks[db] = url

            if "publishers/providers" in line:
                origPublisher = True

            if "<Provider>" in line:
                provider = True
            if "</Provider>" in line:
                provider = False

            if "<Url>" in line and not provider:
                # the url is XML-escaped in the reply; "&amp;" is decoded
                # last so that an escaped ampersand cannot create a new entity
                url = stripTag(line)
                url = url.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
    finally:
        html.close()

    logging.debug("%s: Found outlinks %s", pmid, str(outlinks))
    return outlinks