Exemplo n.º 1
0
def lookupDoi(metaInfoDict, repeatCount=2, delaySecs=5):
    """Query the crossref.org REST API with a free-form citation and return the DOI.

    Builds a free-form citation string from the fields of metaInfoDict
    (authors, title, journal, year, vol, issue, page, printIssn), sends it to
    https://api.crossref.org/works and returns the DOI of the best-matching
    item, or None on any network/parse failure.

    repeatCount/delaySecs are passed through to maxCommon.retryHttpRequest
    as retry count and delay between retries.

    >>> lookupDoi({"authors":"M. Henrion, D. J. Mortlock, D. J. Hand, and A. Gandy", "title":"A Bayesian approach to star-galaxy classification", "journal":"Monthly Notices of the Royal Astronomical Society", "vol":"414", "issue":"4", "page":"2286", "year":"2011", "printIssn" : ""})
    u'10.1111/j.1365-2966.2010.18055.x'
    """

    # construct the free-form citation query URL
    mid = metaInfoDict
    logging.debug("Looking up DOI for article %s, %s with crossref links api" %
                  (mid["authors"], mid["title"]))
    freeFormCitFields = [
        mid["authors"],
        '"%s"' % mid["title"], mid["journal"], mid["year"],
        "vol. " + mid["vol"], "no. " + mid["issue"], "pp. " + mid["page"],
        mid["printIssn"]
    ]
    freeFormCitStr = ", ".join(freeFormCitFields)
    logging.debug("crossref.org query %s" % freeFormCitStr)
    url = "https://api.crossref.org/works"

    geturl = url + "?query=" + urllib2.quote(freeFormCitStr.encode('utf-8'))

    # send request
    httpResp = maxCommon.retryHttpRequest(geturl,
                                          None,
                                          delaySecs=delaySecs,
                                          repeatCount=repeatCount)
    if httpResp is None:  # identity check: None comparison with "is", not "=="
        logging.debug("HTTPError while sending crossref request")
        return None
    jsonStr = ""
    try:
        jsonStr = httpResp.read()
        httpResp.close()
    # was a bare "except:" which also traps SystemExit/KeyboardInterrupt;
    # narrowed to Exception while keeping the best-effort behavior
    except Exception:
        logging.debug("sslError while reading httpResp")
        return None
    xrdata = json.loads(jsonStr)

    # parse result
    if not xrdata:
        logging.debug("Empty cross reply")
        return None

    try:
        items = xrdata["message"]["items"]
    except KeyError:
        logging.debug("Unexpected JSON content from crossref")
        return None
    if len(items) == 0:
        logging.debug("no results in crossref reply")
        return None

    # crossref orders items by relevance score, so the first is the best match
    firstRes = items[0]

    logging.debug("Best match from Crossref: %s" % firstRes)
    doi = firstRes["DOI"]
    logging.debug("Got DOI: %s" % doi)
    return doi
Exemplo n.º 2
0
def getOutlinks(pmid, preferPmc=False):
    """Use NCBI eutils elink to get fulltext outlinks for a pmid.

    Returns an OrderedDict provider-name -> url (only providers flagged as
    "publishers/providers" with a full-text attribute), or None if the HTTP
    request fails.  If preferPmc is True and a PMC url is found, all other
    links are discarded and only {"pmc": url} is returned.
    """
    logging.debug("%s: Getting outlink from pubmed" % (pmid))
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=%s&retmode=llinks&cmd=llinks" % pmid
    userAgent = 'User-Agent: Mozilla ([email protected], http://text.soe.ucsc.edu)'

    html = maxCommon.retryHttpRequest(url, userAgent=userAgent)

    if html is None:
        return None

    outlinks = OrderedDict()
    provider = False
    fullText = False
    # initialize the per-<ObjUrl> state here too: previously these were only
    # set when an <ObjUrl> tag was seen first, so a reply with a <NameAbbr>
    # or <Url> before the first <ObjUrl> raised UnboundLocalError
    origPublisher = False
    curUrl = ""

    # line-oriented scan of the elink XML: <ObjUrl> starts a record,
    # the other tags fill in its fields
    for line in html:
        if line.find("<ObjUrl>") != -1:
            curUrl = ""
            fullText = False
            origPublisher = False
        if line.find("Attribute") != -1:
            attribute = stripTag(line)
            if attribute.lower(
            ) == "full-text online" or attribute == "full-text pdf":
                fullText = True
        if line.find("<NameAbbr>") != -1 and fullText and origPublisher:
            db = stripTag(line)
            outlinks[db] = curUrl
        if line.find("publishers/providers") != -1:
            origPublisher = True
        if line.find("<Provider>") != -1:
            provider = True
        if line.find("</Provider>") != -1:
            provider = False
        if line.find("<DbFrom>") != -1:
            db = stripTag(line)
        if line.find("<Url>") != -1 and not provider:
            curUrl = line
            # undo the XML entity escaping done by eutils
            curUrl = stripTag(curUrl).replace("&amp;", "&")  # XX strange!
            curUrl = stripTag(curUrl).replace("&lt;", "<")
            curUrl = stripTag(curUrl).replace("&gt;", ">")
            if "www.ncbi.nlm.nih.gov" in curUrl and "/pmc/" in curUrl and preferPmc:
                # override all other links
                outlinks.clear()
                outlinks["pmc"] = curUrl

    logging.debug("%s: Found outlinks %s" % (pmid, str(outlinks)))
    return outlinks
Exemplo n.º 3
0
def getOutlinks(pmid, preferPmc=False):
    """Use NCBI eutils elink to get fulltext outlinks for a pmid.

    Returns an OrderedDict provider-name -> url (only providers flagged as
    "publishers/providers" with a full-text attribute), or None if the HTTP
    request fails.  If preferPmc is True and a PMC url is found, all other
    links are discarded and only {"pmc": url} is returned.
    """
    logging.debug("%s: Getting outlink from pubmed" % (pmid))
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=%s&retmode=llinks&cmd=llinks" % pmid
    userAgent = 'User-Agent: Mozilla ([email protected], http://text.soe.ucsc.edu)'

    html = maxCommon.retryHttpRequest(url, userAgent=userAgent)

    if html is None:
        return None

    outlinks = OrderedDict()
    provider = False
    fullText = False
    # initialize per-<ObjUrl> state up front: previously these were only
    # assigned inside the loop, so a reply with <NameAbbr> or <Url> before
    # the first <ObjUrl> raised UnboundLocalError
    origPublisher = False
    curUrl = ""

    # line-oriented scan of the elink XML: <ObjUrl> starts a record,
    # the other tags fill in its fields
    for line in html:
        if line.find("<ObjUrl>") != -1:
            curUrl = ""
            fullText = False
            origPublisher = False
        if line.find("Attribute") != -1:
            attribute = stripTag(line)
            if attribute.lower() == "full-text online" or attribute == "full-text pdf":
                fullText = True
        if line.find("<NameAbbr>") != -1 and fullText and origPublisher:
            db = stripTag(line)
            outlinks[db] = curUrl
        if line.find("publishers/providers") != -1:
            origPublisher = True
        if line.find("<Provider>") != -1:
            provider = True
        if line.find("</Provider>") != -1:
            provider = False
        if line.find("<DbFrom>") != -1:
            db = stripTag(line)
        if line.find("<Url>") != -1 and not provider:
            curUrl = line
            # undo the XML entity escaping done by eutils
            curUrl = stripTag(curUrl).replace("&amp;", "&") # XX strange!
            curUrl = stripTag(curUrl).replace("&lt;", "<")
            curUrl = stripTag(curUrl).replace("&gt;", ">")
            if "www.ncbi.nlm.nih.gov" in curUrl and "/pmc/" in curUrl and preferPmc:
                # override all other links
                outlinks.clear()
                outlinks["pmc"] = curUrl

    logging.debug("%s: Found outlinks %s" % (pmid, str(outlinks)))
    return outlinks
Exemplo n.º 4
0
def lookupDoi(metaInfoDict, repeatCount=2, delaySecs=5):
    """Query the crossref.org REST API with a free-form citation and return the DOI.

    Builds a free-form citation string from the fields of metaInfoDict
    (authors, title, journal, year, vol, issue, page, printIssn), sends it to
    https://api.crossref.org/works and returns the DOI of the best-matching
    item, or None on any network/parse failure.

    repeatCount/delaySecs are passed through to maxCommon.retryHttpRequest
    as retry count and delay between retries.

    >>> lookupDoi({"authors":"M. Henrion, D. J. Mortlock, D. J. Hand, and A. Gandy", "title":"A Bayesian approach to star-galaxy classification", "journal":"Monthly Notices of the Royal Astronomical Society", "vol":"414", "issue":"4", "page":"2286", "year":"2011", "printIssn" : ""})
    u'10.1111/j.1365-2966.2010.18055.x'
    """

    # construct the free-form citation query URL
    mid = metaInfoDict
    logging.debug("Looking up DOI for article %s, %s with crossref links api" % (mid["authors"], mid["title"]))
    freeFormCitFields = [mid["authors"], '"%s"' % mid["title"], mid["journal"], mid["year"],
                         "vol. " + mid["vol"], "no. " + mid["issue"], "pp. " + mid["page"],
                         mid["printIssn"]]
    freeFormCitStr = ", ".join(freeFormCitFields)
    logging.debug("crossref.org query %s" % freeFormCitStr)
    url = "https://api.crossref.org/works"

    geturl = url + "?query=" + urllib2.quote(freeFormCitStr.encode('utf-8'))

    # send request
    httpResp = maxCommon.retryHttpRequest(geturl, None, delaySecs=delaySecs, repeatCount=repeatCount)
    if httpResp is None:  # identity check: None comparison with "is", not "=="
        logging.debug("HTTPError while sending crossref request")
        return None
    jsonStr = ""
    try:
        jsonStr = httpResp.read()
        httpResp.close()
    # was a bare "except:" which also traps SystemExit/KeyboardInterrupt;
    # narrowed to Exception while keeping the best-effort behavior
    except Exception:
        logging.debug("sslError while reading httpResp")
        return None
    xrdata = json.loads(jsonStr)

    # parse result
    if not xrdata:
        logging.debug("Empty cross reply")
        return None

    try:
        items = xrdata["message"]["items"]
    except KeyError:
        logging.debug("Unexpected JSON content from crossref")
        return None
    if len(items) == 0:
        logging.debug("no results in crossref reply")
        return None

    # crossref orders items by relevance score, so the first is the best match
    firstRes = items[0]

    logging.debug("Best match from Crossref: %s" % firstRes)
    doi = firstRes["DOI"]
    logging.debug("Got DOI: %s" % doi)
    return doi
Exemplo n.º 5
0
def lookupDoi(metaInfoDict, repeatCount=2, delaySecs=5):
    """Query the legacy search.crossref.org "links" API and return the DOI.

    Builds a free-form citation string from the fields of metaInfoDict
    (authors, title, journal, year, vol, issue, page, printIssn), POSTs it as
    a JSON-encoded single-element list to http://search.crossref.org/links
    and returns the DOI of the best match with any leading
    "http://dx.doi.org/" prefix stripped, or None on failure/no match.

    repeatCount/delaySecs are passed through to maxCommon.retryHttpRequest
    as retry count and delay between retries.

    >>> lookupDoi({"authors":"M. Henrion, D. J. Mortlock, D. J. Hand, and A. Gandy", "title":"A Bayesian approach to star-galaxy classification", "journal":"Monthly Notices of the Royal Astronomical Society", "vol":"414", "issue":"4", "page":"2286", "year":"2011", "printIssn" : ""})
    u'10.1111/j.1365-2966.2010.18055.x'
    """

    # construct the free-form citation and the JSON request body
    mid = metaInfoDict
    logging.debug("Looking up DOI for article %s, %s with crossref links api" % (mid["authors"], mid["title"]))
    freeFormCitFields = [mid["authors"], '"%s"' % mid["title"], mid["journal"], mid["year"],
                         "vol. " + mid["vol"], "no. " + mid["issue"], "pp. " + mid["page"],
                         mid["printIssn"]]
    freeFormCitStr = ", ".join(freeFormCitFields)
    url = "http://search.crossref.org/links?"
    # the API expects a JSON list of citation strings as the POST body
    jsonParam = json.dumps([freeFormCitStr])
    logging.debug("JSON string %s" % jsonParam)
    # (removed the unused queryData/queryParam dicts left over from an
    # earlier GET-based version of this query)

    # send request
    httpResp = maxCommon.retryHttpRequest(url, jsonParam, delaySecs=delaySecs, repeatCount=repeatCount)
    if httpResp is None:  # identity check: None comparison with "is", not "=="
        logging.debug("HTTPError while sending crossref request")
        return None

    jsonStr = httpResp.read()
    xrdata = json.loads(jsonStr)

    # parse result
    if not xrdata:
        logging.debug("Empty cross reply")
        return None

    if not xrdata["query_ok"]:
        logging.debug("Query error from crossref")
        return None
    elif "results" not in xrdata or len(xrdata["results"]) < 1:
        logging.debug("no results in crossref reply")
        return None

    firstRes = xrdata["results"][0]
    if not firstRes["match"]:
        # typo fix: was "crossref resply"
        logging.debug("no match in crossref reply")
        return None

    logging.debug("Best match from Crossref: %s" % firstRes)
    doi = firstRes["doi"]
    doi = doi.replace("http://dx.doi.org/", "")  # crossref now always adds the url, strip it
    logging.debug("Got DOI: %s" % doi)
    return doi
Exemplo n.º 6
0
def getOutlinks(pmid):
    """Use NCBI eutils elink to get fulltext outlinks for a pmid.

    Returns a dict provider-name -> url (only providers flagged as
    "publishers/providers" with a full-text attribute), or None if the
    HTTP request fails.
    """
    logging.debug("%s: Getting outlink from pubmed" % (pmid))
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=%s&retmode=llinks&cmd=llinks" % pmid
    req = urllib2.Request(url)
    req.add_header("User-Agent", "User-Agent: Mozilla ([email protected], http://text.soe.ucsc.edu)")

    html = maxCommon.retryHttpRequest(req)

    if html is None:
        return None

    outlinks = {}
    provider = False
    fullText = False
    # initialize per-<ObjUrl> state up front: previously origPublisher and
    # url-per-record were only assigned inside the loop, so a reply with a
    # <NameAbbr> before the first <ObjUrl> raised UnboundLocalError
    origPublisher = False
    curUrl = ""

    # line-oriented scan of the elink XML: <ObjUrl> starts a record,
    # the other tags fill in its fields
    for line in html:
        if line.find("<ObjUrl>") != -1:
            curUrl = ""
            fullText = False
            origPublisher = False
        if line.find("Attribute") != -1:
            attribute = stripTag(line)
            if attribute.lower() == "full-text online" or attribute == "full-text pdf":
                fullText = True
        if line.find("<NameAbbr>") != -1 and fullText and origPublisher:
            db = stripTag(line)
            outlinks[db] = curUrl
        if line.find("publishers/providers") != -1:
            origPublisher = True
        if line.find("<Provider>") != -1:
            provider = True
        if line.find("</Provider>") != -1:
            provider = False
        if line.find("<DbFrom>") != -1:
            db = stripTag(line)
        if line.find("<Url>") != -1 and not provider:
            curUrl = line
            # undo the XML entity escaping done by eutils
            curUrl = stripTag(curUrl).replace("&amp;", "&")  # XX strange!
            curUrl = stripTag(curUrl).replace("&lt;", "<")
            curUrl = stripTag(curUrl).replace("&gt;", ">")
    logging.debug("%s: Found outlinks %s" % (pmid, str(outlinks)))
    return outlinks