Пример #1
0
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'
    }
    landingpage = requests.get(url, headers=headers)
    soup = BeautifulSoup(landingpage.text)
    pdflink = soup.find('a', {'id': 'pdfLink'})
    if pdflink is not None:
        pdflink = pdflink['href']
    else:
        # Try something else.
        pdflink = soup.find('div', {'id': 'article-download'})
        if pdflink is None:
            print 'Could not find PDF link on ScienceDirect page.'
            return False
        pdflink = 'http:' + pdflink.a['href']
    # Sciencedirect requires cookies for pdf downloads.
    pdf = requests.get(pdflink, cookies=landingpage.cookies,
                       headers=headers).content
    if pdf[:4] != '%PDF':
        print 'You do not seem to have the permission to view this pdf.'
        return False
    with open(filename, 'wb') as f:
        f.write(pdf)
    return True


import base
# Register the ScienceDirect downloader for article URLs.  The pattern is a
# raw string so that ``\.`` is kept verbatim instead of being parsed as an
# (invalid) string escape sequence.
base.register_module(r'http://www\.sciencedirect\.com/science/article/.*', {
    'name': 'sciencedirect',
    'download_pdf': download_pdf,
})
Пример #2
0
def download_pdf(url, filename):
    """Download the PDF of a ScienceDirect article to *filename*.

    The landing page at *url* is fetched, a PDF link is scraped from it, and
    the PDF is then requested with the landing page's cookies.

    Args:
        url: Address of the article landing page.
        filename: Path the PDF is written to (opened in binary mode).

    Returns:
        True if the PDF was downloaded and written; False if no PDF link
        could be found on the page or the downloaded data does not start
        with the ``%PDF`` magic bytes.
    """
    headers = {"User-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0"}
    landingpage = requests.get(url, headers=headers)
    soup = BeautifulSoup(landingpage.text)
    anchor = soup.find("a", {"id": "pdfLink"})
    if anchor is not None:
        pdflink = anchor["href"]
    else:
        # Try something else.
        container = soup.find("div", {"id": "article-download"})
        # The fallback container may be absent, or present without any <a>
        # child; both cases mean there is no link to follow.
        if container is None or container.a is None:
            print("Could not find PDF link on ScienceDirect page.")
            return False
        # NOTE(review): the href here is presumably scheme-less
        # ("//host/path"), hence the "http:" prefix -- confirm.
        pdflink = "http:" + container.a["href"]
    # Sciencedirect requires cookies for pdf downloads.
    pdf = requests.get(pdflink, cookies=landingpage.cookies, headers=headers).content
    # ``content`` is a byte string, so compare against a bytes literal; this
    # is a plain str on Python 2 and bytes on Python 3.
    if pdf[:4] != b"%PDF":
        print("You do not seem to have the permission to view this pdf.")
        return False
    with open(filename, "wb") as f:
        f.write(pdf)
    return True


import base

# Register the ScienceDirect downloader for article URLs.  The pattern is a
# raw string so that ``\.`` is kept verbatim instead of being parsed as an
# (invalid) string escape sequence.
base.register_module(
    r"http://www\.sciencedirect\.com/science/article/.*", {"name": "sciencedirect", "download_pdf": download_pdf}
)
Пример #3
0
import requests, re

# Matches the "FullText PDF" anchor on an ACM citation page; group 1 captures
# the href value up to (not including) its closing quote.  Written as a raw
# string so that ``\s`` reaches ``re`` verbatim rather than being an invalid
# string escape sequence.
download_pdf_regex = re.compile(
    r'\s*<a name="FullTextPDF" title="FullText PDF" href="([^"]*).*')


def download_pdf(url, filename):
    """Fetch the full-text PDF linked from an ACM citation page.

    The page at *url* is downloaded and searched with
    ``download_pdf_regex``; the captured href is appended to
    ``http://dl.acm.org/`` and fetched with the first response's cookies.

    Returns False when the page contains no full-text link, True once the
    downloaded bytes have been written to *filename*.
    """
    browser_headers = {
        'User-agent':
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'
    }
    landing_response = requests.get(url, headers=browser_headers)

    link_match = download_pdf_regex.search(landing_response.text)
    if link_match is None:
        return False

    # ACM requires cookies for pdf downloads.
    pdf_response = requests.get('http://dl.acm.org/' + link_match.group(1),
                                cookies=landing_response.cookies,
                                headers=browser_headers)
    payload = pdf_response.content
    with open(filename, 'wb') as out:
        out.write(payload)
    return True


import base
# Register the ACM downloader for citation pages.  The pattern is a raw
# string so that ``\.`` is kept verbatim instead of being parsed as an
# (invalid) string escape sequence.
base.register_module(r'http://dl\.acm\.org/citation\.cfm.*', {
    'name': 'ACM',
    'download_pdf': download_pdf,
})
Пример #4
0
        ('ctl00$ctl14$SearchControl$BasicPublicationTextBox', ''),
        ('ctl00$ctl14$SearchControl$BasicVolumeTextBox', ''),
        ('ctl00$ctl14$SearchControl$BasicIssueTextBox', ''),
        ('ctl00$ctl14$SearchControl$BasicPageTextBox', ''),
        ('ctl00$ContentPrimary$ctl00$ctl00$Export', 'CitationOnlyRadioButton'),
        ('ctl00$ContentPrimary$ctl00$ctl00$CitationManagerDropDownList',
         'BibTex'),
        ('ctl00$ContentPrimary$ctl00$ctl00$ExportCitationButton',
         'Export+Citation'), ('__EVENTVALIDATION', eventvalidation)
    ])
    return urllib.urlretrieve(url, filename, data=data) is not None


def download_pdf_chapter(url, filename):
    """Retrieve the PDF for a Springer chapter URL into *filename*.

    The PDF address is derived from *url* by replacing the first
    ``/chapter/`` with ``/content/pdf/`` and appending ``.pdf``.

    Returns True when ``urllib.urlretrieve`` yields a non-None result;
    exceptions raised by the retrieval are not caught here.
    """
    pdf_url = url.replace('/chapter/', '/content/pdf/', 1) + '.pdf'
    outcome = urllib.urlretrieve(pdf_url, filename)
    return outcome is not None


import base
# Register the Springer downloaders.  Both patterns are raw strings so that
# ``\.`` is kept verbatim instead of being parsed as an (invalid) string
# escape sequence.
base.register_module(
    r'http://www\.springerlink\.com/content/.*', {
        'name': 'springerlink',
        'download_pdf': download_pdf,
        'download_bib': download_bib,
    })
# Chapter pages only get a PDF downloader.
base.register_module(r'http://link\.springer\.com/chapter/.*', {
    'name': 'springerlink_chapter',
    'download_pdf': download_pdf_chapter,
})
Пример #5
0
import requests, re

# Locates the "FullText PDF" anchor in an ACM citation page; group 1 is the
# href value up to its closing quote.  A raw string keeps ``\s`` intact for
# ``re`` instead of relying on an invalid string escape sequence.
download_pdf_regex = re.compile(r'\s*<a name="FullTextPDF" title="FullText PDF" href="([^"]*).*')

def download_pdf(url, filename):
    """Save the full-text PDF referenced by an ACM citation page.

    Downloads *url*, looks for the full-text link using
    ``download_pdf_regex`` and, if one is present, fetches it from
    ``http://dl.acm.org/`` (reusing the cookies of the first response) and
    writes the raw bytes to *filename*.

    Returns True after writing the file, or False if no link was found.
    """
    user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'
    http_headers = {'User-agent': user_agent}

    page = requests.get(url, headers=http_headers)
    found = download_pdf_regex.search(page.text)
    if found is None:
        return False

    target = 'http://dl.acm.org/' + found.group(1)
    # ACM requires cookies for pdf downloads.
    reply = requests.get(target, cookies=page.cookies, headers=http_headers)
    data = reply.content
    with open(filename, 'wb') as handle:
        handle.write(data)
    return True

import base
# Register the ACM downloader for citation pages.  The pattern is a raw
# string so that ``\.`` is kept verbatim instead of being parsed as an
# (invalid) string escape sequence.
base.register_module(r'http://dl\.acm\.org/citation\.cfm.*',
                     {'name': 'ACM',
                      'download_pdf': download_pdf,
                      })
Пример #6
0
import requests

def download_pdf(url, filename):
    """Store the PDF that corresponds to an arXiv abstract URL.

    Each ``/abs/`` in *url* is replaced with ``/pdf/`` to form the download
    address.  The response body is written to *filename* unchanged; its
    content is not inspected.  Returns True once the file has been written.
    """
    pdf_url = url.replace('/abs/', '/pdf/')
    response = requests.get(
        pdf_url,
        headers={'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'})
    payload = response.content
    with open(filename, 'wb') as out:
        out.write(payload)
    return True

import base
# Register the arXiv downloader for abstract pages.  The dot in the host name
# is escaped (in a raw string) so it only matches a literal ".".
# NOTE(review): assumes register_module treats the pattern as a regular
# expression, as the trailing ``.*`` suggests -- confirm.
base.register_module(r'http://arxiv\.org/abs/.*',
                     {'name': 'arXiv',
                      'download_pdf': download_pdf,
                      })