示例#1
0
def http_download(download_url, outfile, proxy_url=None, proxy_port=None):
    """Download *download_url* into the local file *outfile*.

    :param download_url: URL to fetch; HTTP redirects are followed.
    :param outfile: destination path, opened in binary write mode.
    :param proxy_url: optional proxy host used for both http and https.
    :param proxy_port: port for *proxy_url* (only used when a proxy is set).
    """
    if proxy_url:
        proxy = "{}:{}".format(proxy_url, proxy_port)
        mainlog.info("Using a proxy : {}".format(proxy))

        urlopener = build_opener(ProxyHandler({
            'https': proxy,
            'http': proxy
        }), HTTPRedirectHandler())
    else:
        mainlog.info("Not using a proxy")
        urlopener = build_opener(HTTPHandler(), HTTPSHandler(),
                                 HTTPRedirectHandler())

    # Some servers refuse requests without a browser-like user agent.
    urlopener.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0'
    )]

    datasource = urlopener.open(download_url)
    try:
        # Stream in 8 KiB chunks; the 'with' block guarantees the output
        # file is closed even when a read or write fails mid-transfer
        # (the original leaked both handles on error).
        with open(outfile, 'wb') as out:
            while True:
                chunk = datasource.read(8192)
                if not chunk:
                    break
                out.write(chunk)
    finally:
        datasource.close()
示例#2
0
def auth(*, email, password, client_id, scope):
    """Log in to VK via the OAuth implicit-grant flow; return (access_token, user_id).

    Drives the HTML login form (and, if shown, the permission-grant form)
    with an opener, then parses the token out of the fragment of the final
    ``blank.html`` redirect URL.

    :param email: account login (filled into the form's "email" field).
    :param password: account password (filled into the form's "pass" field).
    :param client_id: VK application id.
    :param scope: permission name or list of names, joined with commas.
    :raises RuntimeError: when a form or the final answer is not as expected.
    :raises NotImplementedError: when a form uses a method other than POST.
    """
    def split_key_value(kv_pair):
        # "key=value" -> ("key", "value"); assumes exactly one '='.
        kv = kv_pair.split("=")
        return kv[0], kv[1]

    # Authorization form
    def auth_user(email, password, client_id, scope, opener):
        # Fetch the login page, fill in credentials, and submit the form.
        # Returns (response body bytes, final URL after redirects).
        response = opener.open(
            "http://oauth.vk.com/oauth/authorize?" + \
            "redirect_uri=http://oauth.vk.com/blank.html&response_type=token&" + \
            "client_id=%s&scope=%s" % (client_id, ",".join(scope))
            )
        doc = response.read().decode('utf-8')
        parser = AuthFormParser()
        parser.feed(doc)
        parser.close()
        if not parser.form_parsed or parser.url is None or "pass" not in parser.params or \
          "email" not in parser.params:
            raise RuntimeError("Something wrong")
        parser.params["email"] = email
        parser.params["pass"] = password
        if parser.method == "POST":
            response = opener.open(parser.url,
                                   urlencode(parser.params).encode('ascii'))
        else:
            raise NotImplementedError("Method '%s'" % parser.method)
        return response.read(), response.geturl()

    # Permission request form
    def give_access(doc, opener):
        # Submit the "grant access" form as-is; returns the final URL.
        parser = AuthFormParser()
        parser.feed(doc)
        parser.close()
        if not parser.form_parsed or parser.url is None:
            raise RuntimeError("Something wrong")
        if parser.method == "POST":
            response = opener.open(parser.url,
                                   urlencode(parser.params).encode('ascii'))
        else:
            raise NotImplementedError("Method '%s'" % parser.method)
        return response.geturl()

    if not isinstance(scope, list):
        scope = [scope]
    # Cookie-aware opener so the login session survives the redirects.
    opener = build_opener(HTTPCookieProcessor(http.cookiejar.CookieJar()),
                          HTTPRedirectHandler())
    doc, url = auth_user(email, password, client_id, scope, opener)
    if urlparse(url).path != "/blank.html":
        # Need to give access to requested scope
        url = give_access(doc.decode('utf-8'), opener)
    if urlparse(url).path != "/blank.html":
        raise RuntimeError("Expected success here")
    # Token and user id arrive in the URL fragment: "#access_token=...&user_id=..."
    answer = dict(
        split_key_value(kv_pair)
        for kv_pair in urlparse(url).fragment.split("&"))
    if "access_token" not in answer or "user_id" not in answer:
        raise RuntimeError("Missing some values in answer")

    return answer["access_token"], answer["user_id"]
示例#3
0
 def setup_method(self, method):
     """Per-test setup: cookie-aware opener plus the app in a subprocess."""
     self.cookies = CookieJar()
     handler_chain = (
         HTTPRedirectHandler(),
         HTTPHandler(debuglevel=0),
         HTTPSHandler(debuglevel=0),
         HTTPCookieProcessor(self.cookies),
     )
     self.opener = build_opener(*handler_chain)
     self.application_process = Process(target=main)
     self.application_process.start()
示例#4
0
    def download(self, url, error_message, timeout, tries):
        """Fetch *url* and return the body as bytes, or False on failure.

        Retries up to *tries* times, immediately retrying on HTTP 503
        (rate limiting) and on read timeouts; any other error is logged
        with *error_message* and aborts the loop.

        :param url: URL to download.
        :param error_message: prefix used in the warning log entries.
        :param timeout: per-request timeout in seconds.
        :param tries: maximum number of attempts.
        """
        http_proxy = self.setting.http_proxy
        https_proxy = self.setting.https_proxy
        if http_proxy or https_proxy:
            proxies = {}
            if http_proxy:
                proxies['http'] = http_proxy
                if not https_proxy:
                    # Reuse the HTTP proxy for HTTPS when none is configured.
                    proxies['https'] = http_proxy
            if https_proxy:
                proxies['https'] = https_proxy
            proxy_handler = ProxyHandler(proxies)
        else:
            # An empty ProxyHandler falls back to *_proxy env variables.
            proxy_handler = ProxyHandler()
        handlers = [proxy_handler, HTTPRedirectHandler()]
        opener = build_opener(*handlers)

        while tries > 0:
            tries -= 1
            try:
                request = Request(
                    url, headers={"User-Agent": "OmniMarkup Downloader"})
                http_file = opener.open(request, timeout=timeout)
                try:
                    return http_file.read()
                finally:
                    # Always release the connection, even if read() fails.
                    http_file.close()

            except HTTPException as e:
                log.warning('%s HTTP exception %s (%s) downloading %s.',
                            error_message, e.__class__.__name__, str(e), url)

            except HTTPError as e:
                # Bitbucket and Github ratelimit using 503 a decent amount
                if e.code == 503:  # e.code is an int; no str() detour needed
                    log.warning(
                        'Downloading %s was rate limited, trying again', url)
                    continue
                log.warning('%s HTTP error %s downloading %s.', error_message,
                            str(e.code), url)

            except URLError as e:
                # Bitbucket and Github timeout a decent amount
                if str(e.reason) == 'The read operation timed out' or \
                        str(e.reason) == 'timed out':
                    log.warning('Downloading %s timed out, trying again', url)
                    continue
                log.warning('%s URL error %s downloading %s.', error_message,
                            str(e.reason), url)
            break
        return False
示例#5
0
 def open(request):
     """Perform the HTTP request described by a Vim-side request dict.

     Builds an opener honoring the requested redirect cap and optional
     Basic/Digest credentials, sends the request (with retries), and
     returns ``(url, status_line + headers, decoded body)``.  An
     HTTPError is treated as a regular response so callers still get
     status, headers and body.
     """
     request = request_vim_to_python(request)
     rhandler = HTTPRedirectHandler()
     # Cap the number of followed redirects at the caller's limit.
     rhandler.max_redirections = request['max_redirect']
     opener = build_opener(rhandler)
     if request['username']:
         passmgr = HTTPPasswordMgrWithDefaultRealm()
         passmgr.add_password(
             None,
             request['url'],
             request['username'],
             request['password'],
         )
         opener.add_handler(HTTPBasicAuthHandler(passmgr))
         opener.add_handler(HTTPDigestAuthHandler(passmgr))
     req = Request(
         url=request['url'],
         data=request['data'],
         headers=request['headers'],
         method=request['method'],
     )
     if request['gzip_decompress']:
         req.add_header('Accept-encoding', 'gzip')
     try:
         res = retry(tries=request['retry'])(opener.open)(
             req, timeout=request['timeout'])
     except HTTPError as e:
         # Error responses still carry status/headers/body; report them
         # instead of raising.
         res = e
     if not hasattr(res, 'version'):
         # urllib2 does not have 'version' field
         import httplib
         res.version = httplib.HTTPConnection._http_vsn
     response_status = "HTTP/%s %d %s\n" % (
         '1.1' if res.version == 11 else '1.0',
         res.code,
         res.msg,
     )
     response_headers = str(res.headers)
     response_body = res.read()
     if (request['gzip_decompress']
             and res.headers.get('Content-Encoding') == 'gzip'):
         response_body = gzip_decompress(response_body)
     if hasattr(res.headers, 'get_content_charset'):
         # Python 3
         response_encoding = res.headers.get_content_charset()
     else:
         # Python 2
         response_encoding = res.headers.getparam('charset')
     # Servers may omit the charset, in which case the lookups above
     # return None and decode(None) would raise TypeError -- fall back
     # to UTF-8.
     response_body = response_body.decode(response_encoding or 'utf-8')
     return (
         request['url'],
         response_status + response_headers,
         response_body,
     )
示例#6
0
文件: session.py 项目: afcarl/sputnik
    def __init__(self, data_path, **kwargs):
        """Create a session whose cookies persist under *data_path*."""
        if not validation.is_data_path(data_path):
            raise Exception('invalid data_path: %s' % data_path)

        cookies_file = os.path.join(data_path, default.COOKIES_FILENAME)
        self.cookie_jar = MozillaCookieJar(cookies_file)
        try:
            self.cookie_jar.load()
        except EnvironmentError:
            # No cookie file yet (or unreadable): start with an empty jar.
            pass

        self.opener = build_opener(HTTPRedirectHandler(),
                                   HTTPCookieProcessor(self.cookie_jar))

        super(Session, self).__init__(**kwargs)
示例#7
0
 def login(self):
     """Log in to the GeoNetwork backend and flag the connection as live."""
     if self.type != 'geonetwork':
         # Only GeoNetwork catalogues require an explicit login step.
         return
     url = "%sgeonetwork/srv/en/xml.user.login" % self.base
     headers = {
         "Content-Type": "application/x-www-form-urlencoded",
         "Accept": "text/plain"
     }
     credentials = urlencode({
         "username": self.user,
         "password": self.password
     })
     request = Request(url, credentials, headers)
     self.opener = build_opener(HTTPCookieProcessor(),
                                HTTPRedirectHandler())
     response = self.opener.open(request)
     doc = dlxml.fromstring(response.read())
     assert doc.tag == 'ok', "GeoNetwork login failed!"
     self.connected = True
示例#8
0
 def __init__(self, proxy=None):
     """Build raw and (optionally) proxied openers.

     :param proxy: None for a direct connection, 'auto' for the built-in
         relay, or a mapping accepted by ProxyHandler.
     """
     # 'global USER_AGENT' was declared here but USER_AGENT is only read,
     # never assigned, so the declaration was needless and is removed.
     self.redirh = HTTPRedirectHandler()
     self.cookie = HTTPCookieProcessor()
     self.rawopen = build_opener(self.redirh, self.cookie)
     if proxy is None or self.no_proxy:
         # NOTE(review): self.no_proxy is read before any instance
         # assignment -- presumably a class attribute; confirm.
         self.opener = self.rawopen
     elif proxy == 'auto':
         # Built-in relay endpoint used in 'auto' mode.
         self.proxyh = ProxyHandler({'http': "https://secure.uku.im:8443"})
         self.opener = build_opener(self.proxyh, self.redirh, self.cookie)
     else:
         self.proxyh = ProxyHandler(proxy)
         self.opener = build_opener(self.proxyh, self.redirh, self.cookie)
     self.extra_headers = {"User-Agent": USER_AGENT}
示例#9
0
    def __init__(self, args):
        """Prepare a browser-like opener with persisted cookies."""
        self.args = args
        self.cj = http.cookiejar.MozillaCookieJar(COOKIES_FILENAME)
        if os.access(COOKIES_FILENAME, os.F_OK):
            # Reload cookies saved by a previous run.
            self.cj.load(os.getcwd() + "/" + COOKIES_FILENAME)
        handler_chain = [
            HTTPRedirectHandler(),
            HTTPHandler(debuglevel=0),
            HTTPSHandler(debuglevel=0),
            HTTPCookieProcessor(self.cj),
        ]
        self.opener = build_opener(*handler_chain)
        # Present ourselves as a desktop Chrome browser.
        self.opener.addheaders = [
            ('User-Agent',
             ('Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36'
              )),
            ('Accept',
             'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
             )
        ]

        if not os.path.exists(TMP_DIR):
            os.makedirs(TMP_DIR)
示例#10
0
    def __init__(self):
        """Set up credentials, URLs and a cookie-aware opener, then log in."""
        self.user = bugtracker_user
        self.password = bugtracker_pass
        self.login_page = 'https://bugs.archlinux.org/index.php?do=authenticate'
        self.target_page = 'https://bugs.archlinux.org/index.php?events%5B%5D=1&events%5B%5D=13&events%5B%5D=2&events%5B%5D=4&event_number=50&do=reports&project=0'
        self.cj = CookieJar()
        handler_chain = (
            HTTPRedirectHandler(),
            HTTPHandler(debuglevel=0),
            HTTPSHandler(debuglevel=0),
            HTTPCookieProcessor(self.cj),
        )
        self.opener = build_opener(*handler_chain)
        self.opener.addheaders = [('User-agent', 'Mozilla/5.0')]

        # Authenticate up front so later requests carry session cookies.
        self.login()
        self.old_events = set()
示例#11
0
class Page(object):
    """Fetch pages through a shared opener and parse them with BeautifulSoup."""

    verb_handler = HTTPHandler()
    if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
        verb_handler.set_http_debuglevel(2)
    redir_handler = HTTPRedirectHandler()
    # One opener shared by all instances of the class.
    opener = build_opener(verb_handler, redir_handler)

    def __init__(self):
        pass

    @staticmethod
    def unenscape_Google_bang_URL(old_URL):
        """
        Rewrite AJAX-crawling ("hash-bang") URLs into their crawlable form.

        See https://developers.google.com/webmasters\
                /ajax-crawling/docs/getting-started for more information
        """
        if '#!' in old_URL:
            return old_URL.replace('#!', '?_escaped_fragment_=')
        if old_URL.startswith('https://groups.google.com/d/topic/'):
            # e.g. https://groups.google.com/d/topic/jbrout/dreCkob3KSs
            #   -> https://groups.google.com/forum/\
            #          ?_escaped_fragment_=topic/jbrout/dreCkob3KSs
            return old_URL.replace(
                'https://groups.google.com/d/',
                'https://groups.google.com/forum/?_escaped_fragment_=')
        return old_URL

    def _get_page_BS(self, URL):
        """Open *URL* (after bang-unescaping) and return a BeautifulSoup tree."""
        response = self.opener.open(self.unenscape_Google_bang_URL(URL))
        markup = response.read()
        soup = BeautifulSoup(markup)
        response.close()
        return soup
示例#12
0
        request.add_unredirected_header('Authorization',
                                        'Bearer ' + auth_token)
        return self.parent.open(request, timeout=request.timeout)


# Got some help from this example https://gist.github.com/FiloSottile/2077115
class HeadRequest(Request):
    """A Request that issues an HTTP HEAD instead of the default verb."""

    def get_method(self):
        # urllib consults this to pick the HTTP verb for the request.
        return "HEAD"


better_urllib_get = OpenerDirector()
better_urllib_get.addheaders = DEFAULT_HEADERS.copy()
# Register the full handler chain: plain and TLS HTTP, redirect following,
# plus the project's socket-file transport and OAuth2 token auth handlers.
for _handler in (HTTPHandler(), HTTPSHandler(), HTTPRedirectHandler(),
                 SocketFileHandler(), Oauth2TokenAuthHandler()):
    better_urllib_get.add_handler(_handler)


class RegistryError(Exception):
    """Registry failure that keeps the offending response for inspection."""

    def __init__(self, response):
        # Store the raw response object so callers can examine it.
        self.response_obj = response


# Util functions
#############################################################################################
def parse_thresholds(spec, include_units=True, units_required=True):
    """
    Given a spec string break it up into ':' separated chunks. Convert strings to ints as it makes sense
示例#13
0
import pickle
import time
import requests

from celerycrawler import settings
from datetime import datetime
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen, Request, HTTPError
from urllib.request import install_opener, build_opener, HTTPRedirectHandler
from couchdb.mapping import Document, TextField, DateTimeField, ListField, FloatField
from django.core.cache import cache

# Install a process-wide default opener so that plain urlopen() calls
# follow HTTP redirects.
install_opener(build_opener(HTTPRedirectHandler()))

class Page(Document):
    type = TextField(default="page")
    url = TextField()
    raw = TextField()
    content = TextField()
    links = ListField(TextField())
    rank = FloatField(default=0)
    last_checked = DateTimeField(default=datetime.now)

    def is_valid(self):
        return (datetime.now() - self.last_checked).days < 7

    def update(self):
        print("updating page")
        
        parse = urlparse(self.url)
示例#14
0
    def get_handlers(self):
        """Assemble the urllib handler chain for this connection.

        Order: optional no-verify HTTPS handler, redirect handler,
        username/password auth (Basic + Digest, plus SSPI handlers on
        Windows), optional PKI client-certificate handler, optional
        proxy, and finally a cookie processor.

        :return: list of handler instances suitable for build_opener().
        """
        handlers = []
        if self._verify_cert == False:
            # Caller explicitly disabled TLS certificate verification.
            ctx = ssl.create_default_context()
            ctx.check_hostname = False
            ctx.verify_mode = ssl.CERT_NONE
            handler = request.HTTPSHandler(context=ctx)
            handlers.append(handler)

        from urllib.request import HTTPRedirectHandler
        # Allow up to 30 redirects/repeated redirects (urllib's defaults
        # are lower).
        redirect_handler = HTTPRedirectHandler()
        redirect_handler.max_redirections = 30
        redirect_handler.max_repeats = 30
        handlers.append(redirect_handler)
        if self._username and self._password:

            passman = request.HTTPPasswordMgrWithDefaultRealm()
            passman.add_password(None, self._parsed_org_url, self._username,
                                 self._password)
            handlers.append(request.HTTPBasicAuthHandler(passman))
            # A second, separate password manager for Digest auth so the
            # two schemes do not share state.
            passman = request.HTTPPasswordMgrWithDefaultRealm()
            passman.add_password(None, self._parsed_org_url, self._username,
                                 self._password)
            handlers.append(request.HTTPDigestAuthHandler(passman))
            if os.name == 'nt':
                try:
                    from arcgis._impl.common._iwa import NtlmSspiAuthHandler, KerberosSspiAuthHandler

                    auth_krb = KerberosSspiAuthHandler()
                    handlers.append(auth_krb)

                    try:
                        auth_NTLM = NtlmSspiAuthHandler()
                        handlers.append(auth_NTLM)
                    except:
                        # NTLM support is best-effort; Kerberos alone may
                        # be sufficient.
                        pass

                except Error as err:
                    # NOTE(review): 'Error' is assumed to come from a
                    # module-level import (winkerberos missing) -- confirm.
                    _log.error(
                        "winkerberos packages is required for IWA authentication (NTLM and Kerberos)."
                    )
                    _log.error(
                        "Please install it:\n\tconda install winkerberos")
                    _log.error(str(err))
            else:
                _log.error(
                    'The GIS uses Integrated Windows Authentication which is currently only supported on the Windows platform'
                )


        if self._auth == "PKI" or \
           (self.cert_file is not None and self.key_file is not None):
            handlers.append(
                HTTPSClientAuthHandler(self.key_file, self.cert_file))
        elif self._portal_connection and \
             self._portal_connection.cert_file is not None and \
             self._portal_connection.key_file is not None:
            # Fall back to the parent portal connection's client cert.
            handlers.append(
                HTTPSClientAuthHandler(self._portal_connection.key_file,
                                       self._portal_connection.cert_file))

        cj = cookiejar.CookieJar()

        if self.proxy_host:  # Simple Proxy Support
            from urllib.request import ProxyHandler
            if self.proxy_port is None:
                self.proxy_port = 80
            proxies = {
                "http": "http://%s:%s" % (self.proxy_host, self.proxy_port),
                "https": "https://%s:%s" % (self.proxy_host, self.proxy_port)
            }
            proxy_support = ProxyHandler(proxies)
            handlers.append(proxy_support)

        handlers.append(request.HTTPCookieProcessor(cj))
        return handlers
示例#15
0
import sys
from urllib.request import OpenerDirector, HTTPRedirectHandler, HTTPSHandler, urlretrieve

REPO_URL = 'https://github.com/facebook/rocksdb'

assert len(sys.argv) > 1, 'Please provide a download directory, e.g. /build'
assert len(sys.argv) < 3, f'Please omit the unexpected arguments: {sys.argv[2:]}'
download_dir = sys.argv[1]

# Bare OpenerDirector with just TLS transport and redirect support.
opener = OpenerDirector()
for handler in (HTTPSHandler(), HTTPRedirectHandler()):
    opener.add_handler(handler)

# GitHub's "latest release" download endpoint redirects to the tagged
# release; the tag name is the last path segment of the Location header.
resp = opener.open(f'{REPO_URL}/releases/latest/download/')
tag_name = resp.headers['location'].split('/')[-1]

release_url = f'{REPO_URL}/archive/{tag_name}.tar.gz'
file_path, headers = urlretrieve(release_url, f'{download_dir}/latest.tar.gz')

# Human-readable status goes to stderr; stdout carries only the version
# (tag without its leading 'v') for consumption by calling scripts.
print(f'RocksDB {tag_name} was downloaded to {file_path}', file=sys.stderr)
print(tag_name[1:])