Пример #1
0
    def get_pdf_from_doi(self, doi, writefile, mode):
        '''
        Download the PDF of an article and write it to an open file.

        The download strategy depends on ``mode``:

        * ``'crossref'`` asks the CrossRef works API for the article's first
          link and downloads it when its content type is ``application/pdf``
          or ``unspecified``.
        * ``'elsevier'`` calls the Elsevier article API, authenticating with
          ``self.els_api_key``.
        * ``'rsc'``, ``'ecs'`` and ``'nature'`` fetch the page returned for
          ``http://dx.doi.org/<doi>``, feed it to the matching class in
          ``scrapers`` and download that scraper's ``download_link``.
        * ``'acs'`` and ``'springer'`` build the PDF URL directly from the DOI.

        Every request uses ``self.timeout_sec`` as its timeout. Any exception
        raised while resolving, downloading or writing is reported as False
        rather than propagated.

        :param doi: DOI string for the article we want to download
        :type doi: str

        :param writefile: file object to write to; it receives the body in
            chunks and may hold partial data if a transfer fails midway
        :type writefile: file

        :param mode: choose from {'crossref' | 'elsevier' | 'rsc' | 'springer' | 'ecs' | 'nature' | 'acs'}, depending on how we wish to access the file
        :type mode: str

        :returns: True on successful write, False otherwise (including an
            unrecognised mode)
        :rtype: bool
        '''

        def stream_to_file(url, headers):
            # Stream ``url`` into ``writefile``; True only for an HTTP 200
            # response whose body was written completely.
            r = requests.get(url,
                             stream=True,
                             headers=headers,
                             timeout=self.timeout_sec)
            try:
                if r.status_code != 200:
                    return False
                for chunk in r.iter_content(2048):
                    writefile.write(chunk)
                return True
            finally:
                # A streamed response holds its connection until it is
                # fully read or closed, so always release it.
                r.close()

        def scrape_download_link(scraper):
            # Feed the page behind the DOI to ``scraper`` and return the
            # download link it exposes (None when the page could not be
            # fetched or no link was found).
            page = requests.get('http://dx.doi.org/' + doi,
                                timeout=self.timeout_sec)
            if page.status_code != 200:
                return None
            scraper.feed(page.content)
            return scraper.download_link

        pdf_only = {'Accept': 'application/pdf'}
        as_browser = {'Accept': 'application/pdf', 'User-agent': 'Mozilla/5.0'}
        # Class names are resolved lazily so ``scrapers`` is only touched for
        # the modes that actually need a scraper.
        scraper_names = {'rsc': 'RSC', 'ecs': 'ECS', 'nature': 'Nature'}

        try:
            if mode == 'crossref':
                reply = requests.get('http://api.crossref.org/works/' + doi,
                                     headers={'Accept': 'application/json'},
                                     timeout=self.timeout_sec)
                link = json.loads(reply.text)['message']['link'][0]
                pdf_url = link['URL']
                app_type = str(link['content-type'])
                if app_type not in ('application/pdf', 'unspecified'):
                    return False
                return stream_to_file(pdf_url, pdf_only)

            if mode == 'elsevier':
                # A failure here may also mean the API download limit was
                # exceeded.
                return stream_to_file(
                    'http://api.elsevier.com/content/article/doi:' + doi +
                    '?view=FULL',
                    {'X-ELS-APIKEY': self.els_api_key,
                     'Accept': 'application/pdf'})

            if mode in scraper_names:
                scraper = getattr(scrapers, scraper_names[mode])()
                download_url = scrape_download_link(scraper)
                if download_url is None:
                    return False
                return stream_to_file(download_url, pdf_only)

            if mode == 'acs':
                return stream_to_file('http://pubs.acs.org/doi/pdf/' + doi,
                                      as_browser)

            if mode == 'springer':
                return stream_to_file(
                    'http://link.springer.com/content/pdf/' + doi, as_browser)
        except Exception:
            # Best-effort contract: network, parsing and write errors all mean
            # "no PDF obtained". KeyboardInterrupt/SystemExit still propagate.
            return False

        return False
Пример #2
0
    def get_pdf_from_doi(self, doi, writefile, mode):
        '''
        Download the PDF of an article and write it to an open file.

        The download strategy depends on ``mode``:

        * ``'crossref'`` asks the CrossRef works API for the article's first
          link and downloads it when its content type is ``application/pdf``.
        * ``'elsevier'`` calls the Elsevier article API, but only when
          ``self.check_els_entitlement(doi)`` is truthy.
        * ``'rsc'`` fetches the page returned for ``http://dx.doi.org/<doi>``,
          feeds it to ``scrapers.RSC`` and downloads its ``download_link``.
        * ``'springer'`` requests ``http://link.springer.com/<doi>.pdf``.

        Requests are sent with ``self.headers``; the elsevier and springer
        modes add ``Accept: application/pdf`` on a private copy so the shared
        headers are left untouched. Any exception raised while resolving,
        downloading or writing is reported as False rather than propagated.

        :param doi: DOI string for the article we want to download
        :type doi: str

        :param writefile: file object to write to; it receives the body in
            chunks and may hold partial data if a transfer fails midway
        :type writefile: file

        :param mode: either 'crossref' | 'elsevier' | 'rsc' | 'springer', depending on how we wish to access the file
        :type mode: str

        :returns: True on successful write, False otherwise (including an
            unrecognised mode)
        :rtype: bool
        '''

        def stream_to_file(url, headers):
            # Stream ``url`` into ``writefile``; True only for an HTTP 200
            # response whose body was written completely.
            r = requests.get(url, stream=True, headers=headers)
            try:
                if r.status_code != 200:
                    return False
                for chunk in r.iter_content(2048):
                    writefile.write(chunk)
                return True
            finally:
                # A streamed response holds its connection until it is
                # fully read or closed, so always release it.
                r.close()

        def pdf_headers():
            # Copy of the shared headers asking for a PDF; copying avoids
            # leaking the Accept override into later requests.
            headers = dict(self.headers)
            headers['Accept'] = 'application/pdf'
            return headers

        try:
            if mode == 'crossref':
                api_url = 'http://api.crossref.org/works/' + doi
                reply = requests.get(api_url, headers=self.headers)
                link = json.loads(reply.text)['message']['link'][0]
                pdf_url = link['URL']
                if link['content-type'] != 'application/pdf':
                    return False
                return stream_to_file(pdf_url, self.headers)

            if mode == 'elsevier':
                if not self.check_els_entitlement(doi):
                    return False
                # A failure here may also mean the API download limit was
                # exceeded.
                return stream_to_file(
                    'http://api.elsevier.com/content/article/doi:' + doi +
                    '?view=FULL', pdf_headers())

            if mode == 'rsc':
                scraper = scrapers.RSC()
                page = requests.get('http://dx.doi.org/' + doi,
                                    headers=self.headers)
                if page.status_code != 200:
                    return False
                scraper.feed(page.content)
                if scraper.download_link is None:
                    return False
                return stream_to_file(scraper.download_link, self.headers)

            if mode == 'springer':
                return stream_to_file(
                    'http://link.springer.com/' + doi + '.pdf', pdf_headers())
        except Exception:
            # Best-effort contract: network, parsing and write errors all mean
            # "no PDF obtained". KeyboardInterrupt/SystemExit still propagate.
            return False

        return False
Пример #3
0
    def get_html_from_doi(self, doi, writefile, mode):
        '''
        Download the HTML full text of an article and write it to an open file.

        The download strategy depends on ``mode``:

        * ``'elsevier'`` calls the Elsevier article API with
          ``self.els_api_key``, but only when
          ``self.check_els_entitlement(doi)`` is truthy.
        * ``'springer'``, ``'wiley'``, ``'acs'`` and ``'emerald'`` build the
          publisher URL directly from the DOI.
        * ``'nature'`` downloads whatever ``http://dx.doi.org/<doi>`` returns.
        * ``'rsc'`` feeds the page behind the DOI to ``scrapers.RSC`` and
          downloads its ``download_link`` with ``articlepdf`` replaced by
          ``articlehtml``.
        * ``'aaas'`` and ``'ecs'`` resolve the DOI first and derive the
          full-text URL from the final (redirected) URL.

        Every request uses ``self.timeout_sec`` as its timeout. Any exception
        raised while resolving, downloading or writing is reported as False
        rather than propagated.

        :param doi: DOI string for the article we want to download
        :type doi: str

        :param writefile: file object to write to; it receives the body in
            chunks and may hold partial data if a transfer fails midway
        :type writefile: file

        :param mode: either 'elsevier' | 'springer' | 'acs' | 'ecs' | 'rsc' | 'nature' | 'wiley' | 'aaas' | 'emerald', depending on how we wish to access the file
        :type mode: str

        :returns: True on successful write, False otherwise (including an
            unrecognised mode)
        :rtype: bool
        '''

        html_headers = {'Accept': 'text/html', 'User-agent': 'Mozilla/5.0'}

        def stream_to_file(url, headers):
            # Stream ``url`` into ``writefile``; True only for an HTTP 200
            # response whose body was written completely.
            r = requests.get(url,
                             stream=True,
                             headers=headers,
                             timeout=self.timeout_sec)
            try:
                if r.status_code != 200:
                    return False
                for chunk in r.iter_content(2048):
                    writefile.write(chunk)
                return True
            finally:
                # A streamed response holds its connection until it is
                # fully read or closed, so always release it.
                r.close()

        try:
            if mode == 'elsevier':
                if not self.check_els_entitlement(doi):
                    return False
                # A failure here may also mean the API download limit was
                # exceeded.
                return stream_to_file(
                    'http://api.elsevier.com/content/article/doi:' + doi +
                    '?view=FULL',
                    {'X-ELS-APIKEY': self.els_api_key, 'Accept': 'text/html'})

            # Modes whose full-text URL is a plain function of the DOI.
            direct_urls = {
                'springer': 'http://link.springer.com/' + doi + '.html',
                'wiley': 'http://onlinelibrary.wiley.com/doi/' + doi + '/full',
                'acs': 'http://pubs.acs.org/doi/full/' + doi,
                'emerald': 'http://www.emeraldinsight.com/doi/full/' + doi,
                'nature': 'http://dx.doi.org/' + doi,
            }
            if mode in direct_urls:
                return stream_to_file(direct_urls[mode], html_headers)

            if mode == 'rsc':
                scraper = scrapers.RSC()
                page = requests.get('http://dx.doi.org/' + doi,
                                    timeout=self.timeout_sec)
                if page.status_code != 200:
                    return False
                scraper.feed(page.content)
                if scraper.download_link is None:
                    return False
                # The scraper yields the PDF link; switch it to the HTML one.
                download_url = scraper.download_link.replace(
                    'articlepdf', 'articlehtml')
                return stream_to_file(download_url, html_headers)

            # These two branches used to sit behind a stray ``return False``
            # and could never run.
            if mode in ('aaas', 'ecs'):
                resp = requests.get('http://dx.doi.org/' + doi,
                                    headers=html_headers,
                                    timeout=self.timeout_sec)
                # Capture fulltext from redirect.
                # NOTE(review): appending the DOI to the redirected URL is
                # kept exactly as written; confirm it is not meant to be
                # ``resp.url + '.full'``.
                download_url = resp.url + doi + '.full'
                return stream_to_file(download_url, html_headers)
        except Exception:
            # Best-effort contract: network, parsing and write errors all mean
            # "no HTML obtained". KeyboardInterrupt/SystemExit still propagate.
            return False

        return False