Example #1
    def handle_401(self, response, **kwargs):
        """Takes the given response and tries digest-auth, if needed."""

        original_request = response.request.copy()
        www_authenticate = response.headers.get('www-authenticate', '').lower()
        www_auth_schemes = [x.strip().split()[0] for x in www_authenticate.split(',') if x.strip()]
        auths_to_try = [x for x in www_auth_schemes if x in [y.lower() for y in self.auth_map.keys()]]

        for auth_scheme in auths_to_try:
            for auth_instance in self.auth_map[auth_scheme]:
                #print 'trying', auth_instance, 'for', auth_scheme

                # Consume content and release the original connection
                # to allow our new request to reuse the same one.
                response.content
                response.raw.release_conn()
                prepared_request = original_request.copy()
                prepared_request.hooks = default_hooks()
                prepared_request.prepare_auth(auth_instance)

                adapter = HTTPAdapter()
                if self.session:
                    # self.session is assumed to be a callable (e.g. a weakref)
                    # yielding the live Session; fall back to the bare adapter
                    adapter = self.session() or adapter
                new_response = adapter.send(prepared_request, **kwargs)
                new_response.history.append(response)
                new_response.request = prepared_request
        
                if new_response.status_code != 401:
                    #print auth_instance, 'successful for', auth_scheme
                    self.current_auth = auth_instance
                    return new_response
                response = new_response

        return response
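
A handler like this is normally registered from the auth object's __call__ as a response hook, mirroring how requests' own HTTPDigestAuth wires up its handle_401; a minimal sketch (placement on the same auth class is assumed):

    def __call__(self, request):
        # fire handle_401 on every response so a 401 can be retried
        request.register_hook('response', self.handle_401)
        return request
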
Example #2
    def __init__(self, config):
        self.config = config

        self.pool_manager = requests.Session()

        self.retry_methods = frozenset(['GET', 'HEAD', 'DELETE', 'OPTIONS'])

        # noinspection PyTypeChecker
        adapter = HTTPAdapter(
                pool_connections=config.http_pool_connections,
                pool_maxsize=config.http_pool_size,
                # max_retries=Retry(
                #         method_whitelist=self.retry_methods,
                #         total=config.http_max_retries,
                #         connect=config.http_max_retries,
                #         read=config.http_max_retries,
                #         status_forcelist=range(500, 600)
                # ),
                # pool_block=True
        )
        adapter.max_retries = config.http_max_retries
        self.pool_manager.mount('https://', adapter)
        self.pool_manager.mount('http://', adapter)

        self.pool_manager.verify = bool(self.config.verify_ssl)
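
The commented-out Retry block above uses method_whitelist, which urllib3 1.26 renamed to allowed_methods; a sketch of the same configuration for a current urllib3 (config fields as above):

    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    retry = Retry(
        allowed_methods=frozenset(['GET', 'HEAD', 'DELETE', 'OPTIONS']),
        total=config.http_max_retries,
        connect=config.http_max_retries,
        read=config.http_max_retries,
        status_forcelist=range(500, 600),
    )
    adapter = HTTPAdapter(
        pool_connections=config.http_pool_connections,
        pool_maxsize=config.http_pool_size,
        max_retries=retry,
    )
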
Example #3
    def _on_request(self, request, **kwargs):
        match = self._find_match(request)

        # TODO(dcramer): find the correct class for this
        if match is None:
            raise ConnectionError('Connection refused')

        headers = {
            'Content-Type': match['content_type'],
        }
        if match['adding_headers']:
            headers.update(match['adding_headers'])

        response = HTTPResponse(
            status=match['status'],
            body=StringIO(match['body']),
            headers=headers,
            preload_content=False,
        )

        adapter = HTTPAdapter()

        r = adapter.build_response(request, response)
        if not match['stream']:
            r.content  # NOQA
        return r
Example #4
    def _on_request(self, request, **kwargs):
        match = self._find_match(request)

        # TODO(dcramer): find the correct class for this
        if match is None:
            error_msg = 'Connection refused: {0}'.format(request.url)
            response = ConnectionError(error_msg)

            self._calls.add(request, response)
            raise response

        headers = {
            'Content-Type': match['content_type'],
        }
        if match['adding_headers']:
            headers.update(match['adding_headers'])

        response = HTTPResponse(
            status=match['status'],
            body=BufferIO(match['body']),
            headers=headers,
            preload_content=False,
        )

        adapter = HTTPAdapter()

        response = adapter.build_response(request, response)
        if not match['stream']:
            response.content  # NOQA

        self._calls.add(request, response)

        return response
Example #5
 def send(self):
     self.url = "%s%s" % (self.base_url, self.path)
     prepped = self.prepare()
     s = Session()
     # print(self.data)
     h = HTTPAdapter()
     h.max_retries = 10
     s.mount('http://', h)
     s.mount('https://', h)
     response = s.send(prepped)
     response.needs_user_token = self.needs_user_token
     response.original_request = self
     return response
Example #6
    def __init__(self, app_id):

        self.app_id = app_id

        # Provides cookie persistence, connection-pooling, and configuration.
        self.session = requests.Session()

        # Create a requests HTTP adapter and set the number of retries to attempt
        adapter = HTTPAdapter()
        adapter.max_retries = 5

        # Register transport adapter for given URL prefix and enable connection retrying.
        self.session.mount(self.API_URL_PREFIX, adapter=adapter)
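
Assigning max_retries after construction works because urllib3 also accepts a plain integer, but passing it to the constructor lets requests normalize the value through Retry.from_int; an equivalent sketch:

    adapter = HTTPAdapter(max_retries=5)
    self.session.mount(self.API_URL_PREFIX, adapter)
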
Example #7
    def send(self, request, **kwargs):
        if (self._is_cache_disabled
            or request.method not in self._cache_allowable_methods):
            response = super(CachedSession, self).send(request, **kwargs)
            response.from_cache = False
            return response

        cache_key = self.cache.create_key(request)

        def send_request_and_cache_response():
            if self._deny_outbound:
                print(request.url)
                raise Exception(("ERROR: OutBound communication was attempted,"
                                 " but deny_outbound was set to True"))

            cache_response = True
            response = super(CachedSession, self).send(request, **kwargs)
            if response.status_code in self._cache_allowable_codes:

                #
                # Special case for cblr:
                # if we get a status of pending then don't cache
                #
                try:
                    if request.url.find('cblr') != -1 and request.method == 'GET':
                        if isinstance(response.json(), dict) and response.json().get('status', '') == 'pending':
                            cache_response = False
                except Exception:
                    cache_response = True
                if cache_response:
                    self.cache.save_response(cache_key, response)

            response.from_cache = False
            return response

        response = self.cache.get_response(cache_key)
        if response is None:
            return send_request_and_cache_response()

        if 'Content-Encoding' in response.headers:
            del response.headers['Content-Encoding']

        adapter = HTTPAdapter()
        response = adapter.build_response(request, response)


        # dispatch hook here, because we've removed it before pickling
        response.from_cache = True
        response = dispatch_hook('response', request.hooks, response, **kwargs)
        return response
Example #8
 def __init__(self, **kwargs):
     super(BetamaxAdapter, self).__init__()
     self.cassette = None
     self.cassette_name = None
     self.http_adapter = HTTPAdapter(**kwargs)
     self.serialize = None
     self.options = {}
Example #9
    def _on_request(self, request, **kwargs):
        match = self._find_match(request)

        # TODO(dcramer): find the correct class for this
        if match is None:
            error_msg = 'Connection refused: {0}'.format(request.url)
            response = ConnectionError(error_msg)

            self._calls.add(request, response)
            raise response

        if 'body' in match and isinstance(match['body'], Exception):
            self._calls.add(request, match['body'])
            raise match['body']

        headers = {
            'Content-Type': match['content_type'],
        }

        if 'callback' in match:  # use callback
            status, r_headers, body = match['callback'](request)
            body = BufferIO(body.encode('utf-8'))
            headers.update(r_headers)

        elif 'body' in match:
            if match['adding_headers']:
                headers.update(match['adding_headers'])
            status = match['status']
            body = BufferIO(match['body'])

        response = HTTPResponse(
            status=status,
            body=body,
            headers=headers,
            preload_content=False,
        )

        adapter = HTTPAdapter()

        response = adapter.build_response(request, response)
        if not match.get('stream'):
            response.content  # NOQA

        self._calls.add(request, response)

        return response
Example #10
    def __init__(self, username, password):
        """
            :username   - Username in 'domain\\username' format
            :password   - Password or hash in "ABCDABCDABCDABCD:ABCDABCDABCDABCD" format.
        """
        if ntlm is None:
            raise Exception("NTLM libraries unavailable")
        #parse the username
        user_parts = username.split('\\', 1)
        self.domain = user_parts[0].upper()
        self.username = user_parts[1]

        self.password = password
        self.adapter = HTTPAdapter()
Example #11
    def __init__(self, username, password):
        """
            :username   - Username in 'domain\\username' format
            :password   - Password or hash in "ABCDABCDABCDABCD:ABCDABCDABCDABCD" format.
        """
        if ntlm is None:
            raise Exception("NTLM libraries unavailable")
        #parse the username
        try:
            self.domain, self.username = username.split('\\', 1)
        except ValueError:
            raise ValueError("username should be in 'domain\\username' format.")
        self.domain = self.domain.upper()

        self.password = password
        self.adapter = HTTPAdapter()
Example #12
    def get_connection(self, *args, **kwargs):
        conn = HTTPAdapter.get_connection(self, *args, **kwargs)

        # Override the urlopen method on this connection
        if not hasattr(conn.urlopen, "wrapped"):
            orig_urlopen = conn.urlopen

            def urlopen(*args, **kwargs):
                timeout = kwargs.pop("timeout", None)
                if isinstance(timeout, Timeout):
                    timeout = Timeout.from_float(timeout.connect_timeout)

                return orig_urlopen(*args, timeout=timeout, **kwargs)

            conn.urlopen = urlopen
            conn.urlopen.wrapped = True

        return conn
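
A usage sketch, assuming the method above lives on an HTTPAdapter subclass named ConnectTimeoutAdapter (the name is hypothetical):

    import requests

    session = requests.Session()
    session.mount('https://', ConnectTimeoutAdapter())
    # requests converts the tuple into a urllib3 Timeout object, which the
    # wrapped urlopen then collapses to its connect component
    response = session.get('https://example.com', timeout=(3.0, 30.0))
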
Example #13
class Foauth(BaseAdapter):
    """The foauth.org transport adapter."""
    def __init__(self, username, password):
        self.auth = (username, password)
        self.http = HTTPAdapter()

    def prepare_request(self, request):
        p = urlparse(request.url)

        # Rewrite the url to use foauth.org
        request.url = FOAUTH_TEMPLATE.format(domain=p.netloc, path=p.path)
        # Authenticate appropriately.
        request.prepare_auth(self.auth)

        return request

    def send(self, request, **kwargs):
        request = self.prepare_request(request)
        return self.http.send(request, **kwargs)
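
Mounted over both schemes, the adapter routes every session request through foauth.org; a usage sketch (credentials and the target URL are placeholders):

    import requests

    session = requests.Session()
    foauth = Foauth('user@example.com', 'secret')
    session.mount('http://', foauth)
    session.mount('https://', foauth)
    response = session.get('https://api.example.com/resource')
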
Example #14
 def proxy_manager_for(self, *args, **kwargs):
     kwargs["ssl_context"] = self.ssl_context
     return HTTPAdapter.proxy_manager_for(self, *args, **kwargs)
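
This override is the proxy-side half of the usual pattern for injecting a custom SSLContext into an HTTPAdapter; a sketch of an enclosing class (the name SSLContextAdapter is an assumption, not from the original):

    from requests.adapters import HTTPAdapter

    class SSLContextAdapter(HTTPAdapter):
        def __init__(self, ssl_context=None, **kwargs):
            self.ssl_context = ssl_context
            super().__init__(**kwargs)

        def init_poolmanager(self, *args, **kwargs):
            # direct connections get the same context as proxied ones
            kwargs['ssl_context'] = self.ssl_context
            return super().init_poolmanager(*args, **kwargs)

        def proxy_manager_for(self, *args, **kwargs):
            kwargs['ssl_context'] = self.ssl_context
            return super().proxy_manager_for(*args, **kwargs)
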
Example #15
    def upload(
        self,
        parts: Union[VideoPart, List[VideoPart]],
        title: str,
        tid: int,
        tag: List[str],
        desc: str,
        source='',
        cover='',
        no_reprint: bool = True,
        dynamic='',
        dtime=None,
        open_elec: bool = True,
        open_subtitle: bool = True,
        max_retry=5,
    ):
        """

        :param parts: 视频列表 VideoPart('part path', 'part title', 'part desc'), 或者 [VideoPart(...), VideoPart(...)]
        :type parts: Union[VideoPart, List[VideoPart]]
        :param title: 视频标题
        :type title: str
        :param tid: 视频分区ID, 参考: https://member.bilibili.com/x/web/archive/pre
                                    或者https://github.com/uupers/BiliSpider/wiki/%E8%A7%86%E9%A2%91%E5%88%86%E5%8C%BA%E5%AF%B9%E5%BA%94%E8%A1%A8
        :type tid: int
        :param tag: 视频tag
        :type tag: List[str]
        :param desc: 视频简介
        :type desc: str
        :param dtime: 定时发布的时间戳,可选(optional) publish date timestamp (10 digits Unix timestamp e.g. 1551533438)
        :type dtime: int
        :param source: 可选:(optional) 转载地址
        :type source: str
        :param cover: 可选:视频封面(optional) cover's URL, use method *cover_up* to get
        :type cover: str
        :param no_reprint: 可选:是否允许转载(optional) Is reprint allowed
        :type no_reprint: bool
        :param dynamic: 粉丝动态
        :type dynamic: str
        :param open_elec: 可选:是否开启充电(optional) whether to open charging panel
        :type open_elec: bool
        :param open_subtitle: 可选:是否允许上传字幕(optional) Is uploading subtitles allowed
        :type open_subtitle: bool
        :param max_retry: 可选:每块最大重试时间(optional) max retry times per chunk
        :type max_retry: int
        """

        if len(title) > 80:
            raise Exception("Title is longer than 80 characters")
        if len(source) > 200:
            raise Exception("Source URL is longer than 200 characters")

        self.session.headers[
            'Content-Type'] = 'application/json; charset=utf-8'
        if not isinstance(parts, list):
            parts = [parts]

        # retry by status
        retries = Retry(
            total=max_retry,
            backoff_factor=1,
            status_forcelist=(504, ),
        )
        self.session.mount('https://', HTTPAdapter(max_retries=retries))
        self.session.mount('http://', HTTPAdapter(max_retries=retries))
        #

        videos = []
        for part in parts:
            filepath = part.path
            filename = os.path.basename(filepath)
            filesize = os.path.getsize(filepath)
            r = self.session.get(
                'https://member.bilibili.com/preupload?'
                'os=upos&upcdn=ws&name={name}&size={size}&r=upos&profile=ugcupos%2Fyb&ssl=0'
                .format(name=parse.quote_plus(filename), size=filesize))
            """return example
            {
                "upos_uri": "upos://ugc/i181012ws18x52mti3gg0h33chn3tyhp.mp4",
                "biz_id": 58993125,
                "endpoint": "//upos-hz-upcdnws.acgvideo.com",
                "endpoints": [
                    "//upos-hz-upcdnws.acgvideo.com",
                    "//upos-hz-upcdntx.acgvideo.com"
                ],
                "chunk_retry_delay": 3,
                "chunk_retry": 200,
                "chunk_size": 4194304,
                "threads": 2,
                "timeout": 900,
                "auth": "os=upos&cdn=upcdnws&uid=&net_state=4&device=&build=&os_version=&ak=×tamp=&sign=",
                "OK": 1
            } 
            """
            json = r.json()
            upos_uri = json['upos_uri']
            endpoint = json['endpoint']
            auth = json['auth']
            biz_id = json['biz_id']
            chunk_size = json['chunk_size']
            self.session.headers['X-Upos-Auth'] = auth  # add auth header
            r = self.session.post('https:{}/{}?uploads&output=json'.format(
                endpoint, upos_uri.replace('upos://', '')))
            # {"upload_id":"72eb747b9650b8c7995fdb0efbdc2bb6","key":"\/i181012ws2wg1tb7tjzswk2voxrwlk1u.mp4","OK":1,"bucket":"ugc"}
            json = r.json()
            upload_id = json['upload_id']

            with open(filepath, 'rb') as f:
                chunks_num = math.ceil(filesize / chunk_size)
                chunks_index = -1
                while True:
                    chunks_data = f.read(chunk_size)
                    if not chunks_data:
                        break
                    chunks_index += 1  # start with 0

                    def upload_chunk():
                        r = self.session.put(
                            'https:{endpoint}/{upos_uri}?'
                            'partNumber={part_number}&uploadId={upload_id}&chunk={chunk}&chunks={chunks}&size={size}&start={start}&end={end}&total={total}'
                            .format(
                                endpoint=endpoint,
                                upos_uri=upos_uri.replace('upos://', ''),
                                part_number=chunks_index + 1,  # starts with 1
                                upload_id=upload_id,
                                chunk=chunks_index,
                                chunks=chunks_num,
                                size=len(chunks_data),
                                start=chunks_index * chunk_size,
                                end=chunks_index * chunk_size +
                                len(chunks_data),
                                total=filesize,
                            ),
                            chunks_data,
                        )
                        return r

                    def retry_upload_chunk():
                        """return :class:`Response` if upload success, else return None."""
                        for i in range(max_retry):
                            r = upload_chunk()
                            if r.status_code == 200:
                                return r
                            log.info(r.text)
                            log.info('{}/{} retry stage {}/{}'.format(
                                chunks_index, chunks_num, i, max_retry))
                            log.info('sleep %ds', 5 * i)
                            time.sleep(5 * i)
                        return None

                    r = retry_upload_chunk()
                    if r:
                        log.info('upload part {}/{}'.format(
                            chunks_index, chunks_num))
                    else:
                        raise Exception(
                            'upload reached max retry times at part {}/{}'.
                            format(chunks_index, chunks_num))

                # NOT DELETE! Refer to https://github.com/comwrg/bilibiliupload/issues/15#issuecomment-424379769
                self.session.post(
                    'https:{endpoint}/{upos_uri}?'
                    'output=json&name={name}&profile=ugcupos%2Fyb&uploadId={upload_id}&biz_id={biz_id}'
                    .format(
                        endpoint=endpoint,
                        upos_uri=upos_uri.replace('upos://', ''),
                        name=filename,
                        upload_id=upload_id,
                        biz_id=biz_id,
                    ),
                    {
                        "parts": [{
                            "partNumber": i,
                            "eTag": "etag"
                        } for i in range(1, chunks_num + 1)]
                    },
                )

            videos.append({
                'filename':
                upos_uri.replace('upos://ugc/', '').split('.')[0],
                'title':
                part.title,
                'desc':
                part.desc
            })

        # if source is empty, copyright=1, else copyright=2
        copyright = 2 if source else 1

        def add():
            r = self.session.post(
                'https://member.bilibili.com/x/vu/web/add?csrf=' + self.csrf,
                json={
                    "copyright": copyright,
                    "source": source,
                    "title": title,
                    "tid": tid,
                    "tag": ','.join(tag),
                    "no_reprint": int(no_reprint),
                    "desc": desc,
                    "cover": cover,
                    "mission_id": 0,
                    "order_id": 0,
                    "videos": videos,
                    "dtime": dtime,
                    "open_elec": int(open_elec),
                    "dynamic": dynamic,
                    "subtitle": {
                        "lan": "",
                        "open": int(open_subtitle),
                    },
                },
            )
            return r

        def retry_add():
            for i in range(max_retry):
                r = add()
                json = r.json()
                code = json['code']
                if code == 0:
                    return r
                # {"code":20001,"message":"投稿服务异常","ttl":1}
                if code in (20001, ):
                    log.info('retry add video {}/{}, {}'.format(
                        i, max_retry, r.text))
                else:
                    raise Exception('Fail to add video, {}'.format(r.text))
                log.info('sleep %ds', 5 * i)
                time.sleep(5 * i)
            raise Exception('Add video reach max retry times.')

        r = retry_add()
        return r.json()
Example #16
    def __init__(self,
                 host,
                 port,
                 token,
                 index,
                 allow_overrides=False,
                 debug=False,
                 flush_interval=15.0,
                 force_keep_ahead=False,
                 hostname=None,
                 protocol='https',
                 proxies=None,
                 queue_size=DEFAULT_QUEUE_SIZE,
                 record_format=False,
                 retry_backoff=2.0,
                 retry_count=5,
                 source=None,
                 sourcetype='text',
                 timeout=60,
                 verify=True):
        """
        Args:
            host (str): The Splunk host param
            port (int): The port the host is listening on
            token (str): Authentication token
            index (str): Splunk index to write to
            allow_overrides (bool): Whether to look for _<param> in log data (ex: _index)
            debug (bool): Whether to print debug console messages
            flush_interval (float): How often thread should run to push events to splunk host
            force_keep_ahead (bool): Sleep instead of dropping logs when queue fills
            hostname (str): The Splunk Enterprise hostname
            protocol (str): The web protocol to use
            proxies (list): The proxies to use for the request
            queue_size (int): The max number of logs to queue, set to 0 for no max
            record_format (bool): Whether the log record will be json
            retry_backoff (float): The requests lib backoff factor
            retry_count (int): The number of times to retry a failed request
            source (str): The Splunk source param
            sourcetype (str): The Splunk sourcetype param
            timeout (float): The time to wait for a response from Splunk
            verify (bool): Whether to perform ssl certificate validation
        """

        global instances
        instances.append(self)
        logging.Handler.__init__(self)

        self.allow_overrides = allow_overrides
        self.host = host
        self.port = port
        self.token = token
        self.index = index
        self.source = source
        self.sourcetype = sourcetype
        self.verify = verify
        self.timeout = timeout
        self.flush_interval = flush_interval
        self.force_keep_ahead = force_keep_ahead
        self.log_payload = ""
        self.SIGTERM = False  # 'True' if application requested exit
        self.timer = None
        # It is possible to get 'behind' and never catch up, so we limit the queue size
        self.queue = list()
        self.max_queue_size = max(queue_size, 0)  # 0 is min queue size
        self.debug = debug
        self.session = requests.Session()
        self.retry_count = retry_count
        self.retry_backoff = retry_backoff
        self.protocol = protocol
        self.proxies = proxies
        self.record_format = record_format

        # Keep ahead depends on queue size, so cannot be 0
        if self.force_keep_ahead and not self.max_queue_size:
            self.write_log(
                "Cannot keep ahead of unbound queue, using default queue size")
            self.max_queue_size = DEFAULT_QUEUE_SIZE

        self.write_debug_log("Starting debug mode")

        if hostname is None:
            self.hostname = socket.gethostname()
        else:
            self.hostname = hostname

        self.write_debug_log("Preparing to override loggers")

        # prevent infinite recursion by silencing requests and urllib3 loggers
        logging.getLogger('requests').propagate = False
        logging.getLogger('urllib3').propagate = False

        # and do the same for ourselves
        logging.getLogger(__name__).propagate = False

        # disable all warnings from urllib3 package
        if not self.verify:
            requests.packages.urllib3.disable_warnings()

        if self.verify and self.protocol == 'http':
            print("[SplunkHandler DEBUG] " +
                  'cannot use SSL verify with an insecure connection')

        if self.proxies is not None:
            self.session.proxies = self.proxies

        # Set up automatic retry with back-off
        self.write_debug_log("Preparing to create a Requests session")
        retry = Retry(
            total=self.retry_count,
            backoff_factor=self.retry_backoff,
            method_whitelist=False,  # Retry for any HTTP verb
            status_forcelist=[500, 502, 503, 504])
        self.session.mount(self.protocol + '://',
                           HTTPAdapter(max_retries=retry))

        self.start_worker_thread()

        self.write_debug_log("Class initialize complete")
Example #17
class Youdao(AbstractDictionary):
    name = '有道词典'
    timeout = 10
    headers = {
        'Host':
        'dict.youdao.com',
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }
    retries = Retry(total=5,
                    backoff_factor=1,
                    status_forcelist=[500, 502, 503, 504])
    session = requests.Session()
    session.mount('http://', HTTPAdapter(max_retries=retries))
    session.mount('https://', HTTPAdapter(max_retries=retries))

    def __init__(self):
        self.groups = []

    def login(self, username: str, password: str, cookie: dict = None) -> dict:
        """
        登陆
        :param username: 用户名
        :param password: 密码
        :param cookie: cookie
        :return: cookie dict
        """
        self.session.cookies.clear()
        if cookie and self._checkCookie(cookie):
            return cookie
        else:
            return self._login(username, password)

    def _checkCookie(self, cookie) -> bool:
        """
        cookie有效性检验
        :param cookie:
        :return: bool
        """
        rsp = requests.get('http://dict.youdao.com/wordbook/wordlist',
                           cookies=cookie,
                           headers=self.headers)
        if 'account.youdao.com/login' not in rsp.url:
            self.indexSoup = BeautifulSoup(rsp.text, features="html.parser")
            logger.info('Cookie is valid')
            cookiesJar = requests.utils.cookiejar_from_dict(cookie,
                                                            cookiejar=None,
                                                            overwrite=True)
            self.session.cookies = cookiesJar
            return True
        logger.info('Cookie has expired')
        return False

    def _login(self, username: str, password: str) -> dict:
        """账号和密码登陆"""
        data = (
            ('app', 'mobile'),
            ('product', 'DICT'),
            ('tp', 'urstoken'),
            ('cf', '7'),
            ('show', 'true'),
            ('format', 'json'),
            ('username', username),
            ('password', hashlib.md5(password.encode('utf-8')).hexdigest()),
            ('um', 'true'),
        )
        try:
            self.session.post(url='https://dict.youdao.com/login/acc/login',
                              timeout=self.timeout,
                              headers=self.headers,
                              data=data)
            cookie = requests.utils.dict_from_cookiejar(self.session.cookies)
            if username and username.lower() in cookie.get('DICT_SESS', ''):
                # after login, fetch the soup object of the wordbook index page
                rsp = self.session.get(
                    'http://dict.youdao.com/wordbook/wordlist',
                    timeout=self.timeout)
                self.indexSoup = BeautifulSoup(rsp.text,
                                               features="html.parser")
                logger.info('Login successful')
                return cookie
            else:
                logger.error('Login failed')
                return {}
        except Exception as error:
            logger.exception(f'Network error: {error}')
            return {}

    def getGroups(self) -> [(str, int)]:
        """
        获取单词本分组
        :return: [(group_name,group_id)]
        """
        elements = self.indexSoup.find('select', id='select_category')
        groups = []
        if elements:
            groups = elements.find_all('option')
            groups = [(e.text, e['value']) for e in groups]
        logger.info(f'Wordbook groups: {groups}')
        self.groups = groups

        return groups

    def getTotalPage(self, groupName: str, groupId: int) -> int:
        """
        获取分组下总页数
        :param groupName: 分组名称
        :param groupId:分组id
        :return:
        """
        totalPages = 1
        try:
            r = self.session.get(
                url='http://dict.youdao.com/wordbook/wordlist',
                timeout=self.timeout,
                params={'tags': groupId})
            soup = BeautifulSoup(r.text, features='html.parser')
            pagination = soup.find('div', id='pagination')
            if pagination:
                finalPageHref = pagination.find_all(
                    'a', class_='next-page')[-1].get('href')
                groups = re.search(r"wordlist\?p=(\d*)", finalPageHref)
                if groups:
                    totalPages = int(groups.group(1))
            else:
                totalPages = 1
        except Exception as error:
            logger.exception(f'Network error: {error}')

        finally:
            totalPages = totalPages - 1 if totalPages > 1 else totalPages
            logger.info(f'Group ({groupName}-{groupId}) has {totalPages} pages in total')
            return totalPages

    def getWordsByPage(self, pageNo: int, groupName: str,
                       groupId: str) -> [str]:
        """
        获取分组下每一页的单词
        :param pageNo: 页数
        :param groupName: 分组名
        :param groupId: 分组id
        :return:
        """
        wordList = []
        try:
            logger.info(f'Fetching wordbook ({groupName}-{groupId}) page {pageNo + 1}')
            rsp = self.session.get(
                'http://dict.youdao.com/wordbook/wordlist',
                params={
                    'p': pageNo,
                    'tags': groupId
                },
            )
            soup = BeautifulSoup(rsp.text, features='html.parser')
            table = soup.find(id='wordlist').table.tbody
            rows = table.find_all('tr')
            for row in rows:
                cols = row.find_all('td')
                wordList.append(cols[1].div.a.text.strip())
        except Exception as e:
            logger.exception(f'Network error: {e}')
        finally:
            logger.info(wordList)
            return wordList
Example #18
import requests, json, os, sys
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import mapping.mapping as m
import conf as c

REQ_SESSION = requests.Session()
retries = Retry(total=10,
                backoff_factor=1,
                status_forcelist=[502, 503, 504, 524])
REQ_SESSION.mount('http://', HTTPAdapter(max_retries=retries))
#REQ_SESSION.mount('https://', HTTPAdapter(max_retries=retries))

params = {
    'key_identity': c.CONF["KEY_IDENTITY"],
    'key_credential': c.CONF["KEY_CREDENTIALS"]
}
#print("\nc.CONF",c.CONF)
print("\n")

########################
############ manually
########################

# 1. upload ontologies
# 2. upload custom controlled vocabularies: use the same names as in vocabularies.json
# 3. copy the ids of vocabularies in vocabularies.json and substitute IDs in "templates" folder (except for City,District,Country)
# 4. reconcile City,District,Country to geonames and save mappings in vocabularies.json
# 5. upload templates where vocabularies are already selected (import in next instances -- control vocab ids match with correct number)
# 6. download google spreadsheet tables as tsv in "tables" folder
Example #19
 def get_connection(self, url, proxies=None):
     url = url.replace(self.redirect_source, self.redirect_target)
     return HTTPAdapter.get_connection(self, url, proxies=proxies)
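
Here get_connection swaps the host used to pick the connection pool, so traffic aimed at redirect_source is actually sent to redirect_target; a sketch of an enclosing adapter (the constructor is assumed, not part of the original):

    from requests.adapters import HTTPAdapter

    class RedirectAdapter(HTTPAdapter):
        def __init__(self, redirect_source, redirect_target, **kwargs):
            self.redirect_source = redirect_source
            self.redirect_target = redirect_target
            super().__init__(**kwargs)

        # get_connection as defined above
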
Example #20
 def set_max_retry(self, for_url, max_retries):
     self._session.mount(for_url, HTTPAdapter(max_retries=max_retries))
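
Session.mount selects the adapter with the longest matching URL prefix, so this helper can scope a retry budget to a single host; a usage sketch (the wrapper object and URLs are placeholders):

    client.set_max_retry('https://api.example.com', 5)  # retried endpoint
    client.set_max_retry('https://', 0)                 # no retries elsewhere
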
Example #21
class BetamaxAdapter(BaseAdapter):

    """This object is an implementation detail of the library.

    It is not meant to be a public API and is not exported as such.

    """

    def __init__(self, **kwargs):
        super(BetamaxAdapter, self).__init__()
        self.cassette = None
        self.cassette_name = None
        self.old_adapters = kwargs.pop('old_adapters', {})
        self.http_adapter = HTTPAdapter(**kwargs)
        self.serialize = None
        self.options = {}

    def cassette_exists(self):
        if self.cassette_name and os.path.exists(self.cassette_name):
            return True
        return False

    def close(self):
        self.http_adapter.close()

    def eject_cassette(self):
        if self.cassette:
            self.cassette.eject()
        self.cassette = None  # Allow self.cassette to be garbage-collected

    def load_cassette(self, cassette_name, serialize, options):
        self.cassette_name = cassette_name
        self.serialize = serialize
        self.options.update(options.items())
        placeholders = self.options.get('placeholders', [])

        default_options = Cassette.default_cassette_options

        match_requests_on = self.options.get(
            'match_requests_on', default_options['match_requests_on']
            )

        preserve_exact_body_bytes = self.options.get(
            'preserve_exact_body_bytes',
            default_options['preserve_exact_body_bytes'],
            )

        self.cassette = Cassette(
            cassette_name, serialize, placeholders=placeholders,
            record_mode=self.options.get('record'),
            preserve_exact_body_bytes=preserve_exact_body_bytes
            )

        if 'record' in self.options:
            self.cassette.record_mode = self.options['record']
        self.cassette.match_options = match_requests_on

        re_record_interval = timedelta.max
        if self.options.get('re_record_interval'):
            re_record_interval = timedelta(self.options['re_record_interval'])

        now = datetime.utcnow()
        if re_record_interval < (now - self.cassette.earliest_recorded_date):
            self.cassette.clear()

    def send(self, request, stream=False, timeout=None, verify=True,
             cert=None, proxies=None):
        interaction = None

        if not self.cassette:
            raise BetamaxError('No cassette was specified or found.')

        if self.cassette.interactions:
            interaction = self.cassette.find_match(request)

        if not interaction and self.cassette.is_recording():
            interaction = self.send_and_record(
                request, stream, timeout, verify, cert, proxies
                )

        if not interaction:
            raise BetamaxError(unhandled_request_message(request,
                                                         self.cassette))

        resp = interaction.as_response()
        resp.connection = self
        return resp

    def send_and_record(self, request, stream=False, timeout=None,
                        verify=True, cert=None, proxies=None):
        adapter = self.find_adapter(request.url)
        response = adapter.send(
            request, stream=True, timeout=timeout, verify=verify,
            cert=cert, proxies=proxies
            )
        self.cassette.save_interaction(response, request)
        return self.cassette.interactions[-1]

    def find_adapter(self, url):
        for (prefix, adapter) in self.old_adapters.items():

            if url.lower().startswith(prefix):
                return adapter
Example #22
from bs4 import BeautifulSoup
import requests, os, csv, urllib.parse, urllib3, time, random
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

urllib3.disable_warnings()

retry_strategy = Retry(
    total=20,
    status_forcelist=[429, 500, 502, 503, 504],
    method_whitelist=["HEAD", "GET", "OPTIONS"],
    backoff_factor=2
)

adapter = HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
http.mount("https://", adapter)
http.mount("http://", adapter)

def get_proxies():
    proxyList = [
        {"http":"169.159.179.248:8080"}, {"http":"105.208.17.58:8080"}, {"http":"160.119.44.210:8080"}, {"http":"196.214.145.106:80"},
        {"https":"41.194.37.106:45381"},{"https":"41.222.159.191:8080"},{"https":"66.251.179.207:8080"}
        ]
    return proxyList

def rotate_proxy(proxies):
    proxy_select = random.randint(-1,len(proxies)-1)
    if proxy_select < 0 or os.getenv('proxy_enabled', default=False):
        time.sleep(random.uniform(0.5, 1.25))
        return None
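
A request through the retry-mounted session then picks up whatever proxy the rotation yields; a hedged sketch (rotate_proxy is truncated above, so the non-None branch returning one of the proxy dicts is an assumption):

    proxies = get_proxies()
    proxy = rotate_proxy(proxies)  # may be None, i.e. a direct connection
    response = http.get('https://example.com', proxies=proxy, timeout=30)
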
Example #23
            'url': download_url,
            'expire': (now + expire).strftime('%Y-%m-%d')
        }
        UrlRecorder.save_record(entry['class_name'], record)

    def _request(self, entry, method, url, **kwargs):
        if not self.requests:
            self.requests = requests.Session()
            if entry_headers := entry.get('headers'):
                if brotli:
                    entry_headers['accept-encoding'] = 'gzip, deflate, br'
                self.requests.headers.update(entry_headers)
            if entry_cookie := entry.get('cookie'):
                self.requests.cookies.update(
                    NetUtils.cookie_str_to_dict(entry_cookie))
            self.requests.mount('http://', HTTPAdapter(max_retries=2))
            self.requests.mount('https://', HTTPAdapter(max_retries=2))
        try:
            response = self.requests.request(method, url, timeout=60, **kwargs)
            if response is not None and response.content:
                if re.search(
                        NetworkErrorReason.DDoS_protection_by_Cloudflare.value,
                        NetUtils.decode(response)):
                    cf_cookie = asyncio.run(SiteBase.get_cf_cookie(entry))
                    self.requests.cookies.update(
                        NetUtils.cookie_str_to_dict(cf_cookie))
                    response = self.requests.request(method,
                                                     url,
                                                     timeout=60,
                                                     **kwargs)
            return response
Example #24
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36',
}
proxies = {'http': '127.0.0.1:8080'}
inject_point = {'id': '2'}

conf = {
    'url': url,
    'method': 'post',
    'proxies': {},
    'inject': inject_point,
    'debug': False
}
conf['proxies'] = proxies
req = requests.session()
req.headers = headers
req.mount('http://', HTTPAdapter(max_retries=3))
req.mount('https://', HTTPAdapter(max_retries=3))
req.proxies = conf['proxies']


def main():
    print(
        'Help info\nPlease provide your action:[get_current_user|get_current_db|get_dbs|get_tables|get_columns|read_file|dump|dump_all]'
    )
    print('main')
    act = raw_input("Please raw_input your action:")
    action(act)


def put_file_contents(filename, contents):
    with open(filename, "a+") as fin:
Example #25
#    Alex Mueller     #
#                     #
#######################

import os
import io
import requests
from requests.adapters import HTTPAdapter
from requests.exceptions import ConnectionError
from PIL import Image, ImageOps, ImageDraw
from io import BytesIO

SITE_URL = 'https://images.alexonsager.net'
SPRITES = './Assets/Sprites' 

pokemonAdapter = HTTPAdapter(max_retries = 10)
session = requests.Session()
session.mount(SITE_URL, pokemonAdapter)

pokemon = ["Missingno.","Bulbasaur","Ivysaur","Venusaur","Charmander","Charmeleon","Charizard","Squirtle","Wartortle","Blastoise","Caterpie","Metapod","Butterfree","Weedle","Kakuna","Beedrill","Pidgey","Pidgeotto","Pidgeot","Rattata","Raticate","Spearow","Fearow","Ekans","Arbok","Pikachu","Raichu","Sandshrew","Sandslash","Nidoran(f)","Nidorina","Nidoqueen","Nidoran(m)","Nidorino","Nidoking","Clefairy","Clefable","Vulpix","Ninetales","Jigglypuff","Wigglytuff","Zubat","Golbat","Oddish","Gloom","Vileplume","Paras","Parasect","Venonat","Venomoth","Diglett","Dugtrio","Meowth","Persian","Psyduck","Golduck","Mankey","Primeape","Growlithe","Arcanine","Poliwag","Poliwhirl","Poliwrath","Abra","Kadabra","Alakazam","Machop","Machoke","Machamp","Bellsprout","Weepinbell","Victreebel","Tentacool","Tentacruel","Geodude","Graveler","Golem","Ponyta","Rapidash","Slowpoke","Slowbro","Magnemite","Magneton","Farfetchd","Doduo","Dodrio","Seel","Dewgong","Grimer","Muk","Shellder","Cloyster","Gastly","Haunter","Gengar","Onix","Drowzee","Hypno","Krabby","Kingler","Voltorb","Electrode","Exeggcute","Exeggutor","Cubone","Marowak","Hitmonlee","Hitmonchan","Lickitung","Koffing","Weezing","Rhyhorn","Rhydon","Chansey","Tangela","Kangaskhan","Horsea","Seadra","Goldeen","Seaking","Staryu","Starmie","Mr. Mime","Scyther","Jynx","Electabuzz","Magmar","Pinsir","Tauros","Magikarp","Gyarados","Lapras","Ditto","Eevee","Vaporeon","Jolteon","Flareon","Porygon","Omanyte","Omastar","Kabuto","Kabutops","Aerodactyl","Snorlax","Articuno","Zapdos","Moltres","Dratini","Dragonair","Dragonite","Mewtwo","Mew"]
prefixes = ["Miss","Bulb","Ivy","Venu","Char","Char","Char","Squirt","War","Blast","Cater","Meta","Butter","Wee","Kak","Bee","Pid","Pidg","Pidg","Rat","Rat","Spear","Fear","Ek","Arb","Pika","Rai","Sand","Sand","Nido","Nido","Nido","Nido","Nido","Nido","Clef","Clef","Vul","Nine","Jiggly","Wiggly","Zu","Gol","Odd","Gloo","Vile","Pa","Para","Veno","Veno","Dig","Dug","Meow","Per","Psy","Gol","Man","Prime","Grow","Arca","Poli","Poli","Poli","Ab","Kada","Ala","Ma","Ma","Ma","Bell","Weepin","Victree","Tenta","Tenta","Geo","Grav","Gol","Pony","Rapi","Slow","Slow","Magne","Magne","Far","Do","Do","See","Dew","Gri","Mu","Shell","Cloy","Gas","Haunt","Gen","On","Drow","Hyp","Krab","King","Volt","Electr","Exegg","Exegg","Cu","Maro","Hitmon","Hitmon","Licki","Koff","Wee","Rhy","Rhy","Chan","Tang","Kangas","Hors","Sea","Gold","Sea","Star","Star","Mr.","Scy","Jyn","Electa","Mag","Pin","Tau","Magi","Gyara","Lap","Dit","Ee","Vapor","Jolt","Flare","Pory","Oma","Oma","Kabu","Kabu","Aero","Snor","Artic","Zap","Molt","Dra","Dragon","Dragon","Mew","Mew"]
postfixes = ["ssingno.","basaur","ysaur","usaur","mander","meleon","izard","tle","tortle","toise","pie","pod","free","dle","una","drill","gey","eotto","eot","tata","icate","row","row","kans","bok","chu","chu","shrew","slash","oran","rina","queen","ran","rino","king","fairy","fable","pix","tales","puff","tuff","bat","bat","ish","oom","plume","ras","sect","nat","moth","lett","trio","th","sian","duck","duck","key","ape","lithe","nine","wag","whirl","wrath","ra","bra","kazam","chop","choke","champ","sprout","bell","bell","cool","cruel","dude","eler","em","ta","dash","poke","bro","mite","ton","fetchd","duo","drio","eel","gong","mer","uk","der","ster","tly","ter","gar","ix","zee","no","by","ler","orb","ode","cute","utor","bone","wak","lee","chan","tung","fing","zing","horn","don","sey","gela","khan","sea","dra","deen","king","yu","mie","mime","ther","nx","buzz","mar","sir","ros","karp","dos","ras","to","vee","eon","eon","eon","gon","nyte","star","to","tops","dactyl","lax","cuno","dos","tres","tini","nair","nite","two","ew"]

if not os.path.exists(SPRITES):
    os.makedirs(SPRITES)

for i in range(152):
    for j in range(152):
        name = ""
        url = "%s/pokemon/fused/%d/%d.%d.png" % (SITE_URL, i, i, j)
        
Example #26
__credits__ = ["Huevos", "WanWizard"]
__license__ = "GPL"
__version__ = "1.0.1"

POLARISATION = {'H': 0, 'V': 1, 'L': 2, 'R': 3}
SYSTEMS = {'DVB-S': 0, 'DVB-S2': 1, 'DSS': -1, 'ISDB': -1,
           'Digicipher 2': -1, 'ABS': -1}
FECS = {'auto': 0, '1/2': 1, '2/3': 2, '3/4': 3, '3/5': 4, '4/5':
        5, '5/6': 6, '6/7': 7, '7/8': 8, '8/9': 9, '9/10': 10, 'none': 15}
MODULATIONS = {'auto': 0, 'QPSK': 1, '8PSK':2, 'QAM16': 3, '16APSK': 4,
               '32APSK': 5, '8PSK Turbo': -1, 'Turbo': -1}
SLEEP_TIMEOUT = 10
PARSER = 'html5lib'

SESSION = requests.Session()
SESSION.mount('http://', HTTPAdapter(max_retries=5))
SESSION.mount('https://', HTTPAdapter(max_retries=5))
SESSION.headers.update({'User-Agent': 'Mozilla/5.0'})

IS_PY3 = sys.version_info >= (3, 0)


def eprint(*args, **kwargs):
    """ print data in std error """
    print(*args, file=sys.stderr, **kwargs)


def escape(title):
    """ xml escape title """
    title = title.replace('&', '&amp;').replace('<', '&lt;')
    return title.replace('>', '&gt;').replace('\"', '&quot;')
Example #27
def download_file(URL,
                  name_file,
                  path_out,
                  retries=10,
                  backoff=10,
                  size_format='Decimal',
                  show_download_progress=True):
    '''

    Save data to a file.

    Parameters
    ----------
    URL : str
        Link to the file.

    name_file : str 
        Name of output file.

    path_out : str
        Path of the folder where the file will be saved.

    retries : int, optional, default 10
        Defines the number of retries used to connect to the server.
        See: https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#module-urllib3.util.retry

    backoff: int, optional, default 10
        A backoff factor to apply between attempts after the second try.
        See: https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#module-urllib3.util.retry


    size_format: str, optional, default 'Decimal'
        Defines how the file size is displayed.
        Options are:
            'Decimal' : divide file size (in bytes) by (1000*1000) 
            'Binary' : divide file size (in bytes) by (1024*1024)

    show_download_progress : boolean, optional, default True
        Enables or disables the display of download progress.

    '''

    StartTime = datetime.now()

    retries_config = Retry(total=retries,
                           backoff_factor=backoff,
                           status_forcelist=[500, 502, 503, 504])

    session = requests.Session()
    session.mount('http://', HTTPAdapter(max_retries=retries_config))
    session.mount('https://', HTTPAdapter(max_retries=retries_config))
    req = session.get(URL, stream=True)
    #req = requests.get(URL, stream=True)
    total_size = int(req.headers['content-length'])
    size = 0
    if size_format == 'Binary':
        dsize = 1024 * 1024
    else:
        dsize = 1000 * 1000

    with open(path_out + name_file, 'wb') as output_file:
        for chunk in req.iter_content(chunk_size=1024):
            if chunk:
                rec_size = output_file.write(chunk)
                size = rec_size + size
                if show_download_progress:
                    print('  {} {:3.0f}% {:.1f}MB {}'.format(
                        name_file, 100.0 * size / total_size, size / dsize,
                        '{}m{}s'.format(
                            round((datetime.now() - StartTime).seconds / 60.0),
                            (datetime.now() - StartTime).seconds % 60) if
                        (datetime.now() - StartTime).seconds > 60 else
                        '{}s'.format((datetime.now() - StartTime).seconds)),
                          end="\r")  #, flush=True)
                    #print('\t{}\t{:3.0f}%\t{:.2f} min'.format(name_file,100.0*size/total_size, (datetime.now()-StartTime).seconds/60.0), end="\r") #, flush=True)
                    if size == total_size:
                        #print('\n')
                        print('  {} {:3.0f}% {:.1f}MB {}'.format(
                            name_file, 100.0 * size / total_size, size / dsize,
                            '{}m{}s'.format(
                                round((datetime.now() - StartTime).seconds /
                                      60.0),
                                (datetime.now() - StartTime).seconds % 60) if
                            (datetime.now() - StartTime).seconds > 60 else
                            '{}s'.format(
                                (datetime.now() - StartTime).seconds)))
Example #28
def make_request(method, url, conn, stream=False, **kwargs):
    """
    Makes a REST request.

    Parameters
    ----------
    method : {'GET', 'POST', 'PUT', 'DELETE'}
        HTTP method.
    url : str
        URL.
    conn : :class:`Connection`
        Connection authentication and configuration.
    stream : bool, default False
        Whether to stream the response contents.
    **kwargs
        Initialization arguments to :class:`requests.Request`.

    Returns
    -------
    :class:`requests.Response`

    """
    if method.upper() not in _VALID_HTTP_METHODS:
        raise ValueError(
            "`method` must be one of {}".format(_VALID_HTTP_METHODS))

    # add auth to headers
    kwargs.setdefault('headers', {}).update(conn.auth)

    with requests.Session() as session:
        session.mount(url, HTTPAdapter(max_retries=conn.retry))
        try:
            request = requests.Request(method, url, **kwargs).prepare()

            # retry loop for broken connections
            MAX_RETRIES = conn.retry.total
            for retry_num in range(MAX_RETRIES + 1):
                logger.debug("Making request ({} retries)".format(retry_num))
                try:
                    response = _make_request(session,
                                             request,
                                             conn.ignore_conn_err,
                                             stream=stream)
                except requests.ConnectionError as e:
                    if ((retry_num == MAX_RETRIES)
                            or ("BrokenPipeError" not in str(e))):
                        if not conn.ignore_conn_err:
                            raise e
                        else:
                            return fabricate_200()
                    time.sleep(1)
                else:
                    break

        except (requests.exceptions.BaseHTTPError,
                requests.exceptions.RequestException) as e:
            if not conn.ignore_conn_err:
                raise e
            # else fall through to fabricate 200 response
        else:
            if response.ok or not conn.ignore_conn_err:
                return response
            # else fall through to fabricate 200 response
        return fabricate_200()
Example #29
class HttpNtlmAuth(AuthBase):
    """HTTP NTLM Authentication Handler for Requests. Supports pass-the-hash."""

    def __init__(self, username, password):
        """
            :username   - Username in 'domain\\username' format
            :password   - Password or hash in "ABCDABCDABCDABCD:ABCDABCDABCDABCD" format.
        """
        if ntlm is None:
            raise Exception("NTLM libraries unavailable")
        #parse the username
        try:
            self.domain, self.username = username.split('\\', 1)
        except ValueError:
            raise ValueError("username should be in 'domain\\username' format.")
        self.domain = self.domain.upper()

        self.password = password
        self.adapter = HTTPAdapter()

    def retry_using_http_NTLM_auth(self, auth_header_field, auth_header,
                                   response, args):
        """Attempts to authenticate using HTTP NTLM challenge/response"""

        if auth_header in response.request.headers:
            return response

        request = copy_request(response.request)
        

        # initial auth header with username. will result in challenge
        auth = 'NTLM %s' % ntlm.create_NTLM_NEGOTIATE_MESSAGE("%s\\%s" % (self.domain,self.username))
        request.headers[auth_header] = auth

        # we must keep the connection because NTLM authenticates the connection, not single requests
        request.headers["Connection"] = "Keep-Alive"

        # A streaming response breaks authentication.
        # This can be fixed by not streaming this request, which is safe because
        # the returned response3 will still have stream=True set if specified in
        # args. In addition, we expect this request to give us a challenge
        # and not the real content, so the content will be short anyway.
        args_nostream = dict(args, stream=False)
        response2 = self.adapter.send(request, **args_nostream)

        # this is important for some web applications that store authentication-related info in cookies (it took a long time to figure out)
        if response2.headers.get('set-cookie'):
            request.headers['Cookie'] = response2.headers.get('set-cookie')

        # get the challenge
        auth_header_value = response2.headers[auth_header_field]
        ntlm_header_value = list(filter(lambda s: s.startswith('NTLM '), auth_header_value.split(',')))[0].strip()
        ServerChallenge, NegotiateFlags = ntlm.parse_NTLM_CHALLENGE_MESSAGE(ntlm_header_value[5:])

        # build response
        request = copy_request(request)
        auth = 'NTLM %s' % ntlm.create_NTLM_AUTHENTICATE_MESSAGE(ServerChallenge, self.username, self.domain, self.password, NegotiateFlags)
        request.headers[auth_header] = auth
        
        response3 = self.adapter.send(request, **args)

        # Update the history.
        response3.history.append(response)
        response3.history.append(response2)

        return response3

    def response_hook(self, r, **kwargs):

        if r.status_code == 401 and 'ntlm' in r.headers.get('www-authenticate','').lower():
            return self.retry_using_http_NTLM_auth('www-authenticate',
                                                   'Authorization', r, kwargs)

        if r.status_code == 407 and 'ntlm' in r.headers.get('proxy-authenticate','').lower():
            return self.retry_using_http_NTLM_auth('proxy-authenticate',
                                                   'Proxy-authorization', r,
                                                   kwargs)

        return r

    def __call__(self, r):
        r.register_hook('response', self.response_hook)
        return r
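
Because HttpNtlmAuth implements requests' AuthBase protocol, it plugs into the standard auth parameter; a minimal usage sketch (host and credentials are placeholders):

    import requests

    session = requests.Session()
    response = session.get('http://intranet.example.com/protected',
                           auth=HttpNtlmAuth('DOMAIN\\user', 'password'))
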
Example #30
from multiprocessing import Manager
from multiprocessing.pool import ThreadPool
from time import sleep
import logging

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib3.exceptions import HTTPError
import numpy as np
import pandas as pd

session = requests.session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

logger = logging.getLogger(__name__)


class AMAPCrawler:
    def __init__(self, keys: (list, tuple, str) = None):
        if keys is None:
            self.keys = ['87a08092f3e9c212e6f06e6327d9f385']
        else:
            if isinstance(keys, str):
                keys = [keys]
            self.keys = keys
Example #31
    def __init__(self,
                 url,
                 method,
                 data=None,
                 kerberos_auth=False,
                 allow_redirects=True,
                 verify_ssl=True,
                 ca=None,
                 use_json=False,
                 headers=None,
                 stream=False,
                 username=None,
                 password=None,
                 client_cert=None,
                 client_key=None,
                 verbose=False,
                 retries_enabled=True):
        def log_error_response_text_hook(resp, *args, **kwargs):
            """requests hook to log error response"""
            if 400 <= resp.status_code <= 599:
                logger.debug('Error response from "%r": "%r"', resp.url,
                             resp.text)

        self.finished = False  # have we read all data?
        self.closed = False  # have we destroyed curl resources?

        self.status_code = 0
        self.headers = None

        retry = Retry(
            total=HTTP_MAX_RETRIES,
            connect=HTTP_MAX_RETRIES,
            backoff_factor=HTTP_BACKOFF_FACTOR,
            status_forcelist=HTTP_RETRIES_STATUS_FORCELIST,
            method_whitelist=HTTP_RETRIES_METHODS_WHITELIST,
            raise_on_status=False,
        )
        self.session = requests.Session()
        self.session.hooks['response'] = [log_error_response_text_hook]

        if retries_enabled:
            self.session.mount('http://', HTTPAdapter(max_retries=retry))
            self.session.mount('https://', HTTPAdapter(max_retries=retry))

        self.url = url
        headers = headers or {}
        method = method.lower()

        if method not in ['post', 'get', 'put', 'patch', 'delete']:
            raise RuntimeError("Unsupported method '%s' for curl call!" %
                               method)

        args = {}

        if method in ['post', 'put', 'patch']:
            headers['Expect'] = ''

        if not verify_ssl:
            args['verify'] = False
        else:
            if ca:
                args['verify'] = ca
            else:
                args['verify'] = True

        if username and password:
            args['auth'] = (username, password)

        if client_cert and client_key:
            args['cert'] = (client_cert, client_key)

        if data:
            args['data'] = data

        if use_json:
            headers['Content-Type'] = 'application/json'

        args['allow_redirects'] = allow_redirects

        if kerberos_auth:
            if not HTTPKerberosAuth:
                raise RuntimeError('Kerberos auth unavailable')
            args['auth'] = HTTPKerberosAuth()

        if stream:
            args['stream'] = True

        args['headers'] = headers
        args['timeout'] = HTTP_REQUEST_TIMEOUT

        self.req = self.session.request(method, url, **args)

        self.headers = self.req.headers
        self.status_code = self.req.status_code
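One portability note on the Retry above: urllib3 1.26 renamed method_whitelist
to allowed_methods, and urllib3 2.0 removed the old name entirely. A hedged,
version-tolerant construction (a sketch reusing the constants above):

try:
    retry = Retry(total=HTTP_MAX_RETRIES,
                  allowed_methods=HTTP_RETRIES_METHODS_WHITELIST)
except TypeError:  # urllib3 < 1.26 only understands method_whitelist
    retry = Retry(total=HTTP_MAX_RETRIES,
                  method_whitelist=HTTP_RETRIES_METHODS_WHITELIST)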
Example #32
0
def get_bars(symbols_list, outdir, start, end):
    """
        Description: \n
        Pulls the data for aggregated bars for symbols given. \n
        NOTE: These values are unadjusted since splits are adjusted for manually \n

        Parameters: \n
        symbols_list ([str]): list of tickers \n
        outdir (str): folder to put csv file in \n
        start (str): start date to start collecting bar data \n
        end (str): end date to stop collecting bar data \n
    """

    session = requests.Session()
    # In case I run into issues, retry my connection
    retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])

    adapter = HTTPAdapter(max_retries=retries)
    session.mount('http://', adapter)
    session.mount('https://', adapter)  # the Polygon endpoints are served over https
    count = 0
    
    barlog = open("barlog.txt", "w")
    
    for symbol in symbols_list:
        try:
            r = session.get(POLYGON_AGGS_URL.format(symbol, start, end, POLYGON_API_KEY))
            if r:
                data = r.json()
            
                # create a pandas dataframe from the information
                if data['queryCount'] > 0:
                    df = pd.DataFrame(data['results'])
                    
                    df['t'] = pd.to_datetime(df['t'], unit='ms')
                    df['date'] = pd.to_datetime(df['t'], unit='ms')
                    df['date'] =  df['date'].dt.date.astype(str)
                    df.set_index('date', inplace=True)
                    df['symbol'] = symbol

                    df.drop(columns=['vw', 'n'], inplace=True)
                    df.rename(columns={'v': 'volume', 'o': 'open', 'c': 'close', 'h': 'high', 'l': 'low', 't': 'date'}, inplace=True)
                                
                    df.to_csv('{}/{}.csv'.format(outdir, symbol), index=True)
                    count += 1

                    # Logging, I could write a short method for this to reuse
                    msg = (symbol + ' file created with record count ' + str(data['queryCount']))
                    print(msg)
                    barlog.write(msg)
                    barlog.write("\n")

                else:
                    msg = ('No data for symbol ' + str(symbol))
                    print(msg)
                    barlog.write(msg)
                    barlog.write("\n")
            else:
                msg = ('No response for symbol ' + str(symbol))
                print(msg)
                barlog.write(msg)
                barlog.write("\n")
        # Log the exception but continue with the next symbol
        except Exception:
            msg = ('****** exception raised for symbol ' + str(symbol))
            print(msg)
            barlog.write(msg)
            barlog.write("\n")
    
    barlog.close()
    return '{} files were exported'.format(count)
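The "short method" the logging comment above wishes for could look like this
(hypothetical helper, not part of the original):

def log_msg(barlog, msg):
    """Print a message and append it to the open bar-log file."""
    print(msg)
    barlog.write(msg + "\n")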
Example #33
0
 def __init__(self, *args, **kwargs):
     self.redirect_source = kwargs.pop("source")
     self.redirect_target = kwargs.pop("target")
     HTTPAdapter.__init__(self, *args, **kwargs)
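A hedged sketch of the send() override such a redirecting adapter usually
pairs with (the snippet only shows __init__, so this is an illustration, not
the author's code):

 def send(self, request, *args, **kwargs):
     if request.url.startswith(self.redirect_source):
         request.url = self.redirect_target + request.url[len(self.redirect_source):]
     return HTTPAdapter.send(self, request, *args, **kwargs)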
Example #34
0
import logging
import time
from concurrent import futures

import pyquery
import requests
from requests.adapters import HTTPAdapter

from libs.request import Request

DEFAULT_POOL_SIZE = 100
HTTP_ADAPTER = HTTPAdapter(pool_connections=DEFAULT_POOL_SIZE, pool_maxsize=DEFAULT_POOL_SIZE + 300)


class BaseCrawler:

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
        self.session = requests.session()
        self.session.mount('https://', HTTP_ADAPTER)
        self.session.mount('http://', HTTP_ADAPTER)

        self.charset = 'utf-8'
        self.logger = logging.getLogger(self.__class__.__name__)
        self.config = {}
        self.error_request = []

    def crawl(self, iterables, thread=None):
        result_list = []
        n = len(iterables)
Example #35
0
from pybatfish.datamodel.referencelibrary import (  # noqa: F401
    NodeRoleDimension, NodeRolesData, ReferenceBook)
from pybatfish.settings.issues import IssueConfig  # noqa: F401
from pybatfish.util import BfJsonEncoder
from .options import Options

# Suppress InsecureRequestWarning raised by the old urllib3 bundled inside requests
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Setup a session, configure retry policy
_requests_session = requests.Session()
# Prefix "http" will cover both "http" & "https"
_requests_session.mount(
    "http",
    HTTPAdapter(
        max_retries=Retry(connect=Options.max_tries_to_connect_to_coordinator,
                          backoff_factor=Options.request_backoff_factor)))

_encoder = BfJsonEncoder()

__all__ = [
    'add_issue_config', 'add_node_role_dimension', 'add_reference_book',
    'delete_issue_config', 'fork_snapshot', 'get_issue_config', 'get_network',
    'get_node_role_dimension', 'get_node_roles', 'get_reference_book',
    'get_reference_library', 'read_question_settings',
    'write_question_settings'
]


def add_issue_config(session, issue_config):
    # type: (Session, IssueConfig) -> None
Example #36
0
def get_response(link,
                 retry_params=None,
                 headers=None,
                 timeout=None,
                 proxies=None,
                 session=None):
    """
    get_response gets the responses of the a URL.

    :param link: link to the content to be recieved
    :type link: str
    :param retry_params: rules to retry
    :type retry_params: dict, optional
    :param headers: headers for the request
    :type headers: dict, optional
    :param timeout: timeout parameters for the request
    :type timeout: tuple, optional
    :param proxies: proxies
    :type proxies: dict, optional
    :param session: a session object to be used
    :type session: requests.Session, optional
    :return: response from the url
    :rtype: requests.models.Response
    """

    if retry_params is None:
        retry_params = {}

    retry_params = {
        **{
            'retries': 5,
            'backoff_factor': 0.3,
            'status_forcelist': (500, 502, 504)
        },
        **retry_params
    }

    if headers is None:
        headers = random_user_agent()

    if timeout is None:
        timeout = (5, 14)

    if session is None:
        session = requests.Session()

    if proxies is None:
        proxies = {}

    retry = Retry(
        total=retry_params.get('retries'),
        read=retry_params.get('retries'),
        connect=retry_params.get('retries'),
        backoff_factor=retry_params.get('backoff_factor'),
        status_forcelist=retry_params.get('status_forcelist'),
    )

    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    page = session.get(link, headers=headers, timeout=timeout, proxies=proxies)

    return page
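A usage sketch (the URL is illustrative):

resp = get_response('https://example.com', retry_params={'retries': 3})
print(resp.status_code)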
Example #37
0
class BetamaxAdapter(BaseAdapter):

    """This object is an implementation detail of the library.

    It is not meant to be a public API and is not exported as such.

    """

    def __init__(self, **kwargs):
        super(BetamaxAdapter, self).__init__()
        self.cassette = None
        self.cassette_name = None
        self.http_adapter = HTTPAdapter(**kwargs)
        self.serialize = None
        self.options = {}

    def cassette_exists(self):
        if self.cassette_name and os.path.exists(self.cassette_name):
            return True
        return False

    def close(self):
        self.http_adapter.close()

    def eject_cassette(self):
        if self.cassette:
            self.cassette.eject()
        self.cassette = None  # Allow self.cassette to be garbage-collected

    def load_cassette(self, cassette_name, serialize, options):
        self.cassette_name = cassette_name
        self.serialize = serialize
        self.options.update(options)
        placeholders = self.options.get('placeholders')

        # load cassette into memory
        if self.cassette_exists():
            self.cassette = Cassette(cassette_name, serialize,
                                     placeholders=placeholders)
        elif os.path.exists(os.path.dirname(cassette_name)):
            self.cassette = Cassette(cassette_name, serialize, 'w+',
                                     placeholders=placeholders)
        else:
            raise RuntimeError(
                'No cassette could be loaded or %s does not exist.' %
                os.path.dirname(cassette_name)
            )

        self.cassette.record_mode = self.options['record']

        re_record_interval = timedelta.max
        if self.options.get('re_record_interval'):
            re_record_interval = timedelta(self.options['re_record_interval'])

        now = datetime.utcnow()
        if re_record_interval < (now - self.cassette.earliest_recorded_date):
            self.cassette.clear()

    def send(self, request, stream=False, timeout=None, verify=True,
             cert=None, proxies=None):
        interaction = None
        match_on = Cassette.default_cassette_options['match_requests_on']
        response = None

        if not self.cassette:
            raise BetamaxError('No cassette was specified or found.')

        if self.cassette.interactions:
            self.cassette.match_options = set(match_on)
            interaction = self.cassette.find_match(request)

        if not interaction and self.cassette.is_recording():
            response = self.http_adapter.send(
                request, stream=True, timeout=timeout, verify=verify,
                cert=cert, proxies=proxies
                )
            self.cassette.save_interaction(response, request)
            interaction = self.cassette.interactions[-1]

        if not interaction:
            raise BetamaxError('A request was made that could not be handled')

        return interaction.as_response()
Example #38
0
class BetamaxAdapter(BaseAdapter):
    """This object is an implementation detail of the library.

    It is not meant to be a public API and is not exported as such.

    """

    def __init__(self, **kwargs):
        super(BetamaxAdapter, self).__init__()
        self.cassette = None
        self.cassette_name = None
        self.old_adapters = kwargs.pop('old_adapters', {})
        self.http_adapter = HTTPAdapter(**kwargs)
        self.serialize = None
        self.options = {}

    def cassette_exists(self):
        """Check if cassette exists on file system.

        :returns: bool -- True if exists, False otherwise
        """
        if self.cassette_name and os.path.exists(self.cassette_name):
            return True
        return False

    def close(self):
        """Propagate close to underlying adapter."""
        self.http_adapter.close()

    def eject_cassette(self):
        """Eject currently loaded cassette."""
        if self.cassette:
            self.cassette.eject()
        self.cassette = None  # Allow self.cassette to be garbage-collected

    def load_cassette(self, cassette_name, serialize, options):
        """Load cassette.

        Loads a previously serialized http response as a cassette

        :param str cassette_name: (required), name of cassette
        :param str serialize: (required), type of serialization i.e 'json'
        :options dict options: (required), options for cassette
        """
        self.cassette_name = cassette_name
        self.serialize = serialize
        self.options.update(options.items())
        placeholders = self.options.get('placeholders', {})
        cassette_options = {}

        default_options = cassette.Cassette.default_cassette_options

        match_requests_on = self.options.get(
            'match_requests_on', default_options['match_requests_on']
            )

        cassette_options['preserve_exact_body_bytes'] = self.options.get(
            'preserve_exact_body_bytes',
            )

        cassette_options['allow_playback_repeats'] = self.options.get(
            'allow_playback_repeats'
            )

        cassette_options['record_mode'] = self.options.get('record')

        for option, value in list(cassette_options.items()):
            if value is None:
                cassette_options.pop(option)

        self.cassette = cassette.Cassette(
            cassette_name, serialize, placeholders=placeholders,
            cassette_library_dir=self.options.get('cassette_library_dir'),
            **cassette_options
            )

        if 'record' in self.options:
            self.cassette.record_mode = self.options['record']

        # NOTE(sigmavirus24): Cassette.match_options is a set, might as well
        # use that instead of overriding it.
        self.cassette.match_options.update(match_requests_on)

        re_record_interval = timedelta.max
        if self.options.get('re_record_interval'):
            re_record_interval = timedelta(self.options['re_record_interval'])

        now = datetime.utcnow()
        if re_record_interval < (now - self.cassette.earliest_recorded_date):
            self.cassette.clear()

    def send(self, request, stream=False, timeout=None, verify=True,
             cert=None, proxies=None):
        """Send request.

        :param request request: request
        :returns: A Response object
        """
        interaction = None
        current_cassette = self.cassette

        if not current_cassette:
            raise BetamaxError('No cassette was specified or found.')

        if current_cassette.interactions:
            interaction = current_cassette.find_match(request)

        if not interaction and current_cassette.is_recording():
            interaction = self.send_and_record(
                request, stream, timeout, verify, cert, proxies
                )

        if not interaction:
            raise BetamaxError(unhandled_request_message(request,
                                                         current_cassette))

        resp = interaction.as_response()
        resp.connection = self
        return resp

    def send_and_record(self, request, stream=False, timeout=None,
                        verify=True, cert=None, proxies=None):
        """Send request and record response.

        The response will be serialized and saved to a
        cassette which can be replayed in the future.

        :param request request: request
        :param bool stream: (optional) defer download until content is accessed
        :param float timeout: (optional) time to wait for a response
        :param bool verify: (optional) verify SSL certificate
        :param str cert: (optional) path to SSL client
        :param proxies dict: (optional) mapping protocol to URL of the proxy
        :return: Interaction
        :rtype: class:`betamax.cassette.Interaction`
        """
        adapter = self.find_adapter(request.url)
        response = adapter.send(
            request, stream=True, timeout=timeout, verify=verify,
            cert=cert, proxies=proxies
            )
        return self.cassette.save_interaction(response, request)

    def find_adapter(self, url):
        """Find adapter.

        Searches for an existing adapter where the url and prefix match.

        :param url str: (required) url of the adapter
        :returns: betamax adapter
        """
        for (prefix, adapter) in self.old_adapters.items():

            if url.lower().startswith(prefix):
                return adapter
Example #39
0
import requests
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import ssl
from functools import wraps
from config import ConfigSectionMap

cfg = ConfigSectionMap('setup')
START = cfg['start']
END = cfg['end']
session = requests.Session()
retries = Retry(total=5,
                backoff_factor=0.1,
                status_forcelist=[500, 502, 503, 504])
session.proxies = proxies  # `proxies` and `headers` are assumed to be defined elsewhere in the source module
session.headers = headers
session.mount('https://', HTTPAdapter(max_retries=retries))


def sslwrap(func):
    @wraps(func)
    def bar(*args, **kw):
        kw['ssl_version'] = ssl.PROTOCOL_TLSv1
        return func(*args, **kw)

    return bar


ssl.wrap_socket = sslwrap(ssl.wrap_socket)
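Note: ssl.wrap_socket was deprecated in Python 3.7 and removed in 3.12, so the
monkey-patch above only works on older interpreters. On modern Python the same
TLSv1 pinning would go through an SSLContext, e.g. (sketch):

ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
ctx.minimum_version = ssl.TLSVersion.TLSv1
ctx.maximum_version = ssl.TLSVersion.TLSv1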

output_file = open(cfg['output_file'], 'a', encoding='utf-8')
exception_urls_file = open(cfg['exception_url_file'], 'a', encoding='utf-8')
Example #40
0
 def __init__(self, base_url):
     self.base_url = base_url
     self.retry_session = requests.Session()
     self.retry_session.mount(self.base_url, HTTPAdapter(
         max_retries=Retry(total=3, status_forcelist=[503])
     ))
Example #41
0
 def init_poolmanager(self, *args, **kwargs):
     kwargs["ssl_context"] = self.ssl_context
     return HTTPAdapter.init_poolmanager(self, *args, **kwargs)
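For context, a minimal self-contained adapter this method plausibly belongs to
(the class name and the ssl_context attribute are assumptions, not the
original code):

import ssl

import requests
from requests.adapters import HTTPAdapter


class SSLContextAdapter(HTTPAdapter):
    """HTTPAdapter that hands a custom SSLContext to its connection pools."""

    def __init__(self, ssl_context=None, **kwargs):
        # must be set before HTTPAdapter.__init__, which calls init_poolmanager
        self.ssl_context = ssl_context or ssl.create_default_context()
        super(SSLContextAdapter, self).__init__(**kwargs)

    def init_poolmanager(self, *args, **kwargs):
        kwargs["ssl_context"] = self.ssl_context
        return HTTPAdapter.init_poolmanager(self, *args, **kwargs)


session = requests.Session()
session.mount('https://', SSLContextAdapter())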
Example #42
0
    def product_parse(self, response):
        if len(response.text) < 40000:
            yield scrapy.Request(url=response.request.url,
                                 callback=self.product_parse,
                                 dont_filter=True,
                                 meta=response.meta)
            return None
        item = response.meta['item']
        # Product URL
        product_url = response.request.url
        # Product ID
        ProductID = product_url.split('/')[-1].split('.')[0]
        # URL ID from the product link
        urlID = product_url.split('/')[-2]
        # Shop name
        try:
            shop_name = re.findall('shopName":"(.*?)"', response.text)[0]
        except:
            try:
                shop_name = re.findall('"curShopName":.*?>(.*?)</a>"',
                                       response.text)[0]
            except:
                try:
                    shop_name = response.xpath(
                        ".//div[@class='si-intro-list']/dl[1]/dd/a/text()"
                    ).extract()[0]
                except:
                    shop_name = None
        # Strip whitespace characters from shop_name (guarding against the
        # case where every extraction attempt above failed and it is None)
        if shop_name:
            shop_name = re.sub(r'[\r\t\n ]', '', shop_name)
        # Product name
        try:
            p_Name = response.xpath(
                ".//div[@class='imgzoom-main']/a[@id='bigImg']/img/@alt"
            ).extract()[0]
        except:
            try:
                p_Name = re.findall('"itemDisplayName":"(.*?)"',
                                    response.text)[0]
            except:
                p_Name = None
        # Category
        try:
            X_type = Selector(response).re('"分类":"(.*?)"')[0]
        except:
            try:
                X_type = Selector(response).re(
                    '分类</span> </div> </td> <td class="val">(.*?)</td>')[0]
            except:
                try:
                    X_type = re.findall('"分类":"(.*?)"', response.text)[0]
                except:
                    X_type = None
        # Brand
        try:
            brand = Selector(response).re('"brandName":"(.*?)"')[0]
        except:
            try:
                brand = Selector(response).re('<li><b>品牌</b>:(.*?)</li>')[0]
            except:
                try:
                    brand = re.findall('"brandName":"(.*?)"', response.text)[0]
                except:
                    brand = None
        # Strip bracketed content from the brand; the full-width parentheses
        # were lost from the original pattern, `（.*?）` is the evident intent
        if brand:
            if re.findall(r'（.*?）', brand):
                re_com = re.compile('（.*?）')
                brand = re.sub(re_com, '', brand)
        if brand:
            if re.findall(r'\(.*?\)', brand):
                re_cn = re.compile('\(.*?\)')
                brand = brand[:0] + re.sub(re_cn, '', brand)
        # Color
        color = None
        # Type / product model
        try:
            X_name = Selector(response).re(
                '型号</span> </div> </td> <td class="val">(.*?)</td>')[0]
        except:
            try:
                X_name = re.findall(
                    '型号</span> </div> </td> <td class="val">(.*?)</td>',
                    response.text)[0]
            except:
                X_name = None
        if X_name:
            if brand:
                if brand in X_name:
                    X_name = re.sub(brand, '', X_name)
            X_name = re.sub(r'（.*?）', '', X_name)  # full-width parens restored
            X_name = re.sub(r'\(.*?\)', '', X_name)
        # Installation method
        try:
            install = Selector(response).re('安装方式:(.*?)</li>')[0]
        except:
            try:
                install = Selector(response).re(
                    '安装方式</span> </div> </td> <td class="val">(.*?)</td>')[0]
            except:
                try:
                    install = re.findall('安装方式:(.*?)</li>', response.text)[0]
                except:
                    try:
                        install = re.findall(
                            '安装方式</span> </div> </td> <td class="val">(.*?)</td>',
                            response.text)[0]
                    except:
                        install = None
        # Whether the water is directly drinkable
        try:
            drink = Selector(response).re('是否直饮:(.*?)</li>')[0]
        except:
            try:
                drink = Selector(response).re(
                    '是否直饮</span> </div> </td> <td class="val">(.*?)</td>')[0]
            except:
                try:
                    drink = re.findall('是否直饮:(.*?)</li>', response.text)[0]
                except:
                    try:
                        drink = re.findall(
                            '是否直饮</span> </div> </td> <td class="val">(.*?)</td>',
                            response.text)[0]
                    except:
                        drink = None
        # Filter cartridge type
        try:
            kinds = Selector(response).re('滤芯种类:(.*?)</li>')[0]
        except:
            try:
                kinds = Selector(response).re(
                    '滤芯种类</span> </div> </td> <td class="val">(.*?)</td>')[0]
            except:
                try:
                    kinds = re.findall('滤芯种类:(.*?)</li>', response.text)[0]
                except:
                    try:
                        kinds = re.findall(
                            '滤芯种类</span> </div> </td> <td class="val">(.*?)</td>',
                            response.text)[0]
                    except:
                        kinds = None
        # Filter cartridge service life
        try:
            life = Selector(response).re('滤芯寿命:(.*?)</li>')[0]
        except:
            try:
                life = Selector(response).re(
                    '滤芯寿命</span> </div> </td> <td class="val">(.*?)</td>')[0]
            except:
                try:
                    life = re.findall('滤芯寿命:(.*?)</li>', response.text)[0]
                except:
                    try:
                        life = re.findall(
                            '滤芯寿命</span> </div> </td> <td class="val">(.*?)</td>',
                            response.text)[0]
                    except:
                        life = None

        # Filtration precision
        try:
            precision = Selector(response).re('过滤精度:(.*?)</li>')[0]
        except:
            try:
                precision = Selector(response).re(
                    '过滤精度</span> </div> </td> <td class="val">(.*?)</td>')[0]
            except:
                try:
                    precision = re.findall('过滤精度:(.*?)</li>', response.text)[0]
                except:
                    try:
                        precision = re.findall(
                            '过滤精度</span> </div> </td> <td class="val">(.*?)</td>',
                            response.text)[0]
                    except:
                        precision = None
        # Core parameters
        type = '"'
        soup = BeautifulSoup(response.text, 'lxml')
        try:
            ul = soup.find('ul', attrs={'class': 'cnt clearfix'})
            li = ul.find_all('li')
            for i in range(len(li)):
                type = type[:] + li[i].text
                if i < len(li) - 1:
                    type = type[:] + ' '
                if i == len(li) - 1:
                    type = type[:] + '"'
        except:
            try:  # some products use a different core-parameter layout
                div = soup.find('div', class_='prod-detail-container')
                ul = div.find('ul', attrs={'class': 'clearfix'})
                li = ul.find_all('li')
                for each in li:
                    li_li = each.find_all('li')
                    for i in range(len(li_li)):
                        type = type[:] + li_li[i].text
                        if i < len(li_li) - 1:
                            type = type[:] + ' '
                        if i == len(li_li) - 1:
                            type = type[:] + '"'
            except:
                type = None
        if type:
            if len(type) < 2:
                type = None
        if type == None:
            try:
                parameter_id = Selector(response).re(
                    '"mainPartNumber":"(.*?)"')[0]
            except:
                try:
                    parameter_id = re.findall('"mainPartNumber":"(.*?)"',
                                              response.text)[0]
                except:
                    parameter_id = None
                    type = None
            if parameter_id:
                try:
                    parameter_id = Selector(response).re(
                        '"mainPartNumber":"(.*?)"')[0]
                    parameter_url = 'https://product.suning.com/pds-web/ajax/itemParameter_%s_R0105002_10051.html' % parameter_id
                    para_response = requests.get(parameter_url).text
                    time.sleep(0.3)
                    eles = re.findall('"snparameterdesc":"(.*?)"',
                                      para_response)
                    souls = re.findall('"snparameterVal":"(.*?)"',
                                       para_response)
                    try:
                        type = '"'
                        for i in range(len(eles)):
                            type = type[:] + eles[i] + ':' + souls[i]
                            if i < len(eles) - 1:
                                type = type[:] + ' '
                            if i == len(eles) - 1:
                                type = type[:] + '"'
                            if len(type) < 2:
                                type = None
                    except:
                        type = None
                    if brand == None:
                        try:
                            brand = re.findall(
                                '"snparameterdesc":"品牌","snparameterVal":"(.*?)"',
                                para_response)[0]
                        except:
                            brand = None
                    try:
                        X_name = re.findall(
                            '"snparameterdesc":"型号","snparameterVal":"(.*?)"',
                            para_response)[0]
                    except:
                        X_name = None
                    if X_name:
                        if brand:
                            if brand in X_name:
                                X_name = re.sub(brand, '', X_name)
                        X_name = re.sub(r'（.*?）', '', X_name)  # full-width parens restored
                        X_name = re.sub(r'\(.*?\)', '', X_name)
                    # Category
                    if X_type == None:
                        try:
                            X_type = re.findall(
                                '"snparameterdesc":"分类","snparameterVal":"(.*?)"',
                                para_response)[0]
                        except:
                            X_type = None
                    # Installation method
                    if install == None:
                        try:
                            install = re.findall(
                                '"snparameterdesc":"安装方式","snparameterVal":"(.*?)"',
                                para_response)[0]
                        except:
                            install = None
                    # Directly drinkable
                    if drink == None:
                        try:
                            drink = re.findall(
                                '"snparameterdesc":"是否直饮","snparameterVal":"(.*?)"',
                                para_response)[0]
                        except:
                            drink = None
                    # Filter cartridge type
                    if kinds == None:
                        try:
                            kinds = re.findall(
                                '"snparameterdesc":"滤芯种类","snparameterVal":"(.*?)"',
                                para_response)[0]
                        except:
                            kinds = None
                    # Filter cartridge service life
                    if life == None:
                        try:
                            life = re.findall(
                                '"snparameterdesc":"滤芯寿命","snparameterVal":"(.*?)"',
                                para_response)[0]
                        except:
                            life = None
                    # Filtration precision
                    if precision == None:
                        try:
                            precision = re.findall(
                                '"snparameterdesc":"过滤精度","snparameterVal":"(.*?)"',
                                para_response)[0]
                        except:
                            precision = None
                except:
                    pass
        # Build the related request URLs
        keyword_url = 'https://review.suning.com/ajax/getreview_labels/general-000000000' + ProductID + '-' + urlID + '-----commodityrLabels.htm'
        comment_url = 'https://review.suning.com/ajax/review_satisfy/general-000000000' + ProductID + '-' + urlID + '-----satisfy.htm'
        price_url = 'https://pas.suning.com/nspcsale_0_000000000' + ProductID + '_000000000' + ProductID + '_' + urlID + '_10_010_0100101_20268_1000000_9017_10106_Z001.html'
        # Fetch impression keywords
        try:
            keyword_response = requests.get(keyword_url).text
            keyword_text = json.loads(
                re.findall(r'\((.*?)\)', keyword_response)[0])
            keyword_list = keyword_text.get('commodityLabelCountList')
            key_str = '"'
            keyword = []
            for i in range(len(keyword_list)):
                key_str = key_str[:] + keyword_list[i].get('labelName')
                if i < len(keyword_list) - 1:
                    key_str = key_str[:] + ' '
                if i == len(keyword_list) - 1:
                    key_str = key_str[:] + '"'
            keyword.append(key_str)
        except:
            keyword = None
        # Fetch review statistics
        try:
            comment_response = requests.get(comment_url).text
            comment_text = json.loads(
                re.findall(r'\((.*?)\)', comment_response)[0])
            comment_list = comment_text.get('reviewCounts')[0]
            # Negative reviews
            PoorCount = comment_list.get('oneStarCount')
            twoStarCount = comment_list.get('twoStarCount')
            threeStarCount = comment_list.get('threeStarCount')
            fourStarCount = comment_list.get('fourStarCount')
            fiveStarCount = comment_list.get('fiveStarCount')
            # Total review count
            CommentCount = comment_list.get('totalCount')
            # Positive reviews
            GoodCount = fourStarCount + fiveStarCount
            # Neutral reviews
            GeneralCount = twoStarCount + threeStarCount
            # Positive-review rate
            # Percentages are rounded to integers
            if CommentCount != 0:
                goodpercent = round(GoodCount / CommentCount * 100)
                generalpercent = round(GeneralCount / CommentCount * 100)
                poorpercent = round(PoorCount / CommentCount * 100)
                commentlist = [GoodCount, GeneralCount, PoorCount]
                percent_list = [goodpercent, generalpercent, poorpercent]
                # Bump non-zero counts that round to under one percent up to 1
                for i in range(len(percent_list)):
                    if percent_list[i] == 0 and commentlist[
                            i] != 0 and CommentCount != 0:
                        percent_list[i] = 1
                nomaxpercent = 0  # running sum of the percentages that are not the maximum
                # positive-rate calculation follows url='http://res.suning.cn/project/review/js/reviewAll.js?v=20170823001'
                if CommentCount != 0:
                    maxpercent = max(goodpercent, generalpercent, poorpercent)
                    for each in percent_list:
                        if maxpercent != each:
                            nomaxpercent += each
                    GoodRateShow = 100 - nomaxpercent
                else:
                    GoodRateShow = 100
            else:
                PoorCount = 0
                CommentCount = 0
                GoodCount = 0
                GeneralCount = 0
                GoodRateShow = 100
        except:
            PoorCount = 0
            CommentCount = 0
            GoodCount = 0
            GeneralCount = 0
            GoodRateShow = 100
        # Price information
        try:
            price_response = requests.get(price_url).text
        except requests.RequestException as e:
            # print(e)
            time.sleep(2)
            s = requests.session()
            s.keep_alive = False
            s.mount('https://', HTTPAdapter(max_retries=5))
            price_response = s.get(price_url).text
        if len(price_response) > 900:
            try:
                price = re.findall('"refPrice":"(.*?)"', price_response)[0]
                PreferentialPrice = re.findall('"promotionPrice":"(.*?)"',
                                               price_response)[0]
                if len(price) < 1:
                    price = re.findall('"netPrice":"(.*?)"', price_response)[0]
                if price and float(price) < float(PreferentialPrice):
                    price, PreferentialPrice = PreferentialPrice, price
            except:
                price = None
                PreferentialPrice = None
        else:
            time.sleep(3)
            price_response = requests.get(price_url).text
            if len(price_response) > 900:
                try:
                    price = re.findall('"refPrice":"(.*?)"', price_response)[0]
                    PreferentialPrice = re.findall('"promotionPrice":"(.*?)"',
                                                   price_response)[0]
                    if len(price) < 1:
                        price = re.findall('"netPrice":"(.*?)"',
                                           price_response)[0]
                    if price and float(price) < float(PreferentialPrice):
                        price, PreferentialPrice = PreferentialPrice, price
                except:
                    price = None
                    PreferentialPrice = None
            else:
                # Treat this as a failure and send the url through the retry helper
                price_response = self.retry_price(price_url)
                if len(price_response) > 500:
                    try:
                        price = re.findall('"refPrice":"(.*?)"',
                                           price_response)[0]
                        PreferentialPrice = re.findall(
                            '"promotionPrice":"(.*?)"', price_response)[0]
                        if len(price) < 1:
                            price = re.findall('"netPrice":"(.*?)"',
                                               price_response)[0]
                        if price and float(price) < float(PreferentialPrice):
                            price, PreferentialPrice = PreferentialPrice, price
                    except:
                        price = None
                        PreferentialPrice = None
                else:
                    PreferentialPrice = None
                    price = None
        if kinds:
            if re.findall(r'\d', kinds) and len(kinds) < 3:
                level = kinds
                kinds = None
            else:
                level = None
        else:
            level = None
        # Guard against items where all the key fields came back empty
        if p_Name is None and brand is None and type is None:
            yield None
        else:
            source = '苏宁'
            item['shop_name'] = shop_name
            item['p_Name'] = p_Name
            item['X_name'] = X_name
            item['type'] = type
            item['price'] = price
            item['PreferentialPrice'] = PreferentialPrice
            item['brand'] = brand
            item['keyword'] = keyword
            item['PoorCount'] = PoorCount
            item['CommentCount'] = CommentCount
            item['GoodCount'] = GoodCount
            item['GeneralCount'] = GeneralCount
            item['GoodRateShow'] = GoodRateShow
            item['install'] = install
            item['drink'] = drink
            item['source'] = source
            item['level'] = level
            item['kinds'] = kinds
            item['life'] = life
            item['precision'] = precision
            item['color'] = color
            item['product_url'] = product_url
            item['ProductID'] = ProductID
            item['X_type'] = X_type
            yield item
Example #43
0
class HttpNtlmAuth(AuthBase):
    """HTTP NTLM Authentication Handler for Requests. Supports pass-the-hash."""

    def __init__(self, username, password):
        """
            :username   - Username in 'domain\\username' format
            :password   - Password or hash in "ABCDABCDABCDABCD:ABCDABCDABCDABCD" format.
        """
        if ntlm is None:
            raise Exception("NTLM libraries unavailable")
        #parse the username
        user_parts = username.split('\\', 1)
        self.domain = user_parts[0].upper()
        self.username = user_parts[1]

        self.password = password
        self.adapter = HTTPAdapter()


    def retry_using_http_NTLM_auth(self, auth_header_field, auth_header, response):
        """Attempts to authenticate using HTTP NTLM challenge/response"""

        if auth_header in response.request.headers:
            return response

        request = response.request
        # initial auth header with username. will result in challenge
        auth = 'NTLM %s' % ntlm.create_NTLM_NEGOTIATE_MESSAGE("%s\\%s" % (self.domain,self.username))
        request.headers[auth_header] = auth

        # we must keep the connection because NTLM authenticates the connection, not single requests
        request.headers["Connection"] = "Keep-Alive"

        response2 = self.adapter.send(request)

        # Some web applications store authentication-related state in cookies, so forward any Set-Cookie from the challenge response.
        if response2.headers.get('set-cookie'):
            request.headers['Cookie'] = response2.headers.get('set-cookie')

        # get the challenge
        auth_header_value = response2.headers[auth_header_field]
        ServerChallenge, NegotiateFlags = ntlm.parse_NTLM_CHALLENGE_MESSAGE(auth_header_value[5:])

        # build response
        auth = 'NTLM %s' % ntlm.create_NTLM_AUTHENTICATE_MESSAGE(ServerChallenge, self.username, self.domain, self.password, NegotiateFlags)
        request.headers[auth_header] = auth
        request.headers["Connection"] = "Close"

        response = self.adapter.send(request)

        return response


    def response_hook(self,r):

        if r.status_code == 401 and 'ntlm' in r.headers.get('www-authenticate','').lower():
            return self.retry_using_http_NTLM_auth('www-authenticate', 'Authorization', r)

        if r.status_code == 407 and 'ntlm' in r.headers.get('proxy-authenticate','').lower():
            return self.retry_using_http_NTLM_auth('proxy-authenticate', 'Proxy-authorization', r)

        return r


    def __call__(self,r):
        r.register_hook('response', self.response_hook)
        return r
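A usage sketch for the handler above (host and credentials are illustrative;
per the docstring, the password may also be an LM:NT hash):

import requests
resp = requests.get('http://intranet.example.com/protected',
                    auth=HttpNtlmAuth('DOMAIN\\alice', 'secret'))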
Example #44
0
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup as bs
import requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
outputfile = open('output.txt', 'w')
s = requests.Session()
s.mount('https://', HTTPAdapter(max_retries=5))
urls = [
    'https://www.changechecker.org/search-results.aspx?denominationId=1&subcategory=Year&subcategoryId=1000',
    'https://www.changechecker.org/search-results.aspx?denominationId=3&subcategory=Year&subcategoryId=1000',
    'https://www.changechecker.org/search-results.aspx?denominationId=4&subcategory=Sport&subcategoryId=1010',
    'https://www.changechecker.org/search-results.aspx?denominationId=15&subcategory=Letter&subcategoryId=-1',
    'https://www.changechecker.org/search-results?denominationId=9&subcategory=Year&subcategoryId=1000',
    'https://www.changechecker.org/search-results.aspx?denominationId=16&subcategory=Definitive&subcategoryId=1020',
    'https://www.changechecker.org/search-results.aspx?denominationId=7&subcategory=Year&subcategoryId=1000',
    'https://www.changechecker.org/search-results.aspx?denominationId=5&subcategory=Year&subcategoryId=1000',
    'https://www.changechecker.org/search-results.aspx?denominationId=2&subcategory=Year&subcategoryId=1000',
    'https://www.changechecker.org/search-results.aspx?denominationId=6&subcategory=Year&subcategoryId=1000'
]
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
}
for url in urls:
    urldata = s.get(url, headers=headers)
    soup = bs(urldata.text, 'html.parser')
    elements = soup.find_all('div', class_='divCoinBackground')
    coinlinks = []
    for element in elements:
        coinlink = 'https://www.changechecker.org/' + element.a['href']
Example #45
0
def download_result_file(url, result_file_directory, result_file_name, decompress, overwrite):
    """ Download file with specified URL and download parameters.

    :param url: The URL of the result file to download.
    :type url: str
    :param result_file_directory: The download result local directory name.
    :type result_file_directory: str
    :param result_file_name: The download result local file name.
    :type result_file_name: str
    :param decompress: Determines whether to decompress the ZIP file.
                        If set to true, the file will be decompressed after download.
                        The default value is false, in which case the downloaded file is not decompressed.
    :type decompress: bool
    :param overwrite: Indicates whether the result file should overwrite the existing file if any.
    :type overwrite: bool
    :return: The download file path.
    :rtype: str
    """

    if result_file_directory is None:
        raise ValueError('result_file_directory cannot be None.')

    if result_file_name is None:
        result_file_name="default_file_name"

    if decompress:
        name, ext=os.path.splitext(result_file_name)
        if ext == '.zip':
            raise ValueError("Result file can't be decompressed into a file with extension 'zip'."
                                " Please change the extension of the result_file_name or pass decompress=false")
        zip_file_path=os.path.join(result_file_directory, name + '.zip')
        result_file_path=os.path.join(result_file_directory, result_file_name)
    else:
        result_file_path=os.path.join(result_file_directory, result_file_name)
        zip_file_path=result_file_path

    if os.path.exists(result_file_path) and overwrite is False:
        if six.PY3:
            raise FileExistsError('Result file: {0} exists'.format(result_file_path))
        else:
            raise OSError('Result file: {0} exists'.format(result_file_path))
    
    # NOTE: SSLv3 is long obsolete and insecure; modern Python builds no longer
    # expose ssl.PROTOCOL_SSLv3, so this only runs on old interpreters.
    pool_manager = PoolManager(
        ssl_version=ssl.PROTOCOL_SSLv3,
    )
    http_adapter=HTTPAdapter()
    http_adapter.poolmanager=pool_manager
    
    s=requests.Session()
    s.mount('https://', http_adapter)
    r=s.get(url, stream=True, verify=True)
    r.raise_for_status()
    try:
        with open(zip_file_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=4096):
                if chunk:
                    f.write(chunk)
                    f.flush()
        if decompress:
            with contextlib.closing(zipfile.ZipFile(zip_file_path)) as compressed:
                first=compressed.namelist()[0]
                with open(result_file_path, 'wb') as f:
                    f.write(compressed.read(first))
    except Exception:
        raise
    finally:
        if decompress and os.path.exists(zip_file_path):
            os.remove(zip_file_path)
    return result_file_path
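A usage sketch (the URL and paths are illustrative):

path = download_result_file('https://example.com/report.zip',
                            result_file_directory='/tmp',
                            result_file_name='report.csv',
                            decompress=True,
                            overwrite=True)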
Example #46
0
 def mount(self, pool_connections=5, pool_maxsize=120):
     self.request.mount('https://',
                        HTTPAdapter(pool_connections, pool_maxsize))
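For reference, the two positional arguments map to
HTTPAdapter(pool_connections=5, pool_maxsize=120): the number of per-host
connection pools to cache, and the maximum connections kept per pool.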
Example #47
0
 def __init__(self, host='localhost', port=1317, tls=False):
     self.host = host
     self.port = port
     self.tls = tls
     self.session = requests.Session()
     # mount() matches on URL prefix, so the scheme must be included for the
     # adapter to actually apply to requests against this host
     scheme = 'https://' if tls else 'http://'
     self.session.mount(scheme + self.host, HTTPAdapter(max_retries=MAX_RETRIES))
Example #48
0
    subs = "%s"
else:
    warn("No recognized database specified: {}".format(args.database))
    sys.exit()

format_url = "http://eol.org/data_objects/{}"
rating_subexp = r"(?:.+?(\d) star.+?^(\d+))"
rating_regexp = r"^<dl class=.rating_counts.>{0}{0}{0}{0}{0}".format(
    rating_subexp)

rating_flags = re.MULTILINE | re.DOTALL
sess = requests.Session()
retries = Retry(total=args.retries,
                backoff_factor=1,
                status_forcelist=[500, 502, 503, 504])
sess.mount('http://', HTTPAdapter(max_retries=retries))

db_curs = db_connection.cursor()
for im_tab in args.images_table:
    db_curs.execute(
        "SELECT DISTINCT src_id FROM {} WHERE src_id IS NOT NULL AND src = {} ORDER BY (rating_confidence IS NOT NULL), updated DESC;"
        .format(im_tab, subs), args.eol_src_flag)
    rows = db_curs.fetchall()
    for i, row in enumerate(rows):
        EOL_data_object_id = int(row[0])
        if args.verbosity:
            print("{}: looking for ratings for EoL data object {}".format(
                i, EOL_data_object_id))
        response = sess.get(format_url.format(EOL_data_object_id), timeout=10)
        m = re.search(rating_regexp, response.text, rating_flags)
        if m:
Example #49
0
"""
from typing import Any, Dict, List

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter, Retry

MAX_RETRIES = 16

retry_strategy = Retry(
    total=MAX_RETRIES,
    status_forcelist=[429, 500, 502, 503, 504],
)

SESSION = requests.Session()
SESSION.mount("http://", HTTPAdapter(max_retries=retry_strategy))
SESSION.mount("https://", HTTPAdapter(max_retries=retry_strategy))


class FetchDemandError(Exception):
    """
    Object for demand fetching exceptions.
    """

    # pylint: disable=unnecessary-pass
    pass


def get_dates(season: str) -> List[str]:
    """
    Get dates with available course demand.
Example #50
0
import os
import time
from multiprocessing import Process

import requests
from requests.adapters import HTTPAdapter

# gevent.monkey.patch_all()
# reload(sys) stood here in the original Python 2 version (paired with
# sys.setdefaultencoding); it is unnecessary on Python 3.

headers = dict()
headers[
    "User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
headers[
    "Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
headers["Accept-Encoding"] = "gzip, deflate, sdch"
headers["Accept-Language"] = "zh-CN,zh;q=0.8"
headers["Accept-Language"] = "zh-CN,zh;q=0.8"
request_retry = HTTPAdapter(max_retries=3)


def my_get(url, refer=None):
    session = requests.session()
    session.headers = headers
    if refer:
        headers["Referer"] = refer
    session.mount('https://', request_retry)
    session.mount('http://', request_retry)
    return session.get(url)


def get_type_content(page):
    if page < 2:
        url = 'http://www.netbian.com/index.htm'
Example #51
0
    def _rest_request(self,
                      url,
                      method,
                      params=None,
                      body=None,
                      fullresponse=False,
                      use_base_url=True):
        # base request method for a REST request
        myheaders = {"User-Agent": "api.py"}
        if method in ["POST", "PUT"]:
            myheaders.update({'Content-type': 'application/json'})

        retry_strategy = Retry(total=3,
                               status_forcelist=[429, 500, 502, 503, 504],
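                               # urllib3 >= 1.26 renames this to allowed_methods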
                               method_whitelist=["HEAD", "GET", "OPTIONS"])
        session = requests.Session()
        session.mount(self.base_rest_url,
                      HTTPAdapter(max_retries=retry_strategy))

        if use_base_url:
            url = self.base_rest_url + url

        try:
            if method == "GET":
                request = requests.Request(
                    method,
                    url,
                    params=params,
                    auth=RequestsAuthPluginVeracodeHMAC(),
                    headers=myheaders)
                prepared_request = request.prepare()
                r = session.send(prepared_request, proxies=self.proxies)
            elif method == "POST":
                r = requests.post(url,
                                  params=params,
                                  auth=RequestsAuthPluginVeracodeHMAC(),
                                  headers=myheaders,
                                  data=body)
            elif method == "PUT":
                r = requests.put(url,
                                 params=params,
                                 auth=RequestsAuthPluginVeracodeHMAC(),
                                 headers=myheaders,
                                 data=body)
            elif method == "DELETE":
                r = requests.delete(url,
                                    params=params,
                                    auth=RequestsAuthPluginVeracodeHMAC(),
                                    headers=myheaders)
            else:
                raise VeracodeAPIError("Unsupported HTTP method")
        except requests.exceptions.RequestException as e:
            logger.exception(self.connect_error_msg)
            raise VeracodeAPIError(e)

        if not (r.status_code == requests.codes.ok):
            logger.debug(
                "API call returned non-200 HTTP status code: {}".format(
                    r.status_code))

        if not (r.ok):
            logger.debug("Error retrieving data. HTTP status code: {}".format(
                r.status_code))
            if r.status_code == 401:
                logger.exception(
                    "Error [{}]: {} for request {}. Check that your Veracode API account credentials are correct."
                    .format(r.status_code, r.text, r.request.url))
            else:
                logger.exception("Error [{}]: {} for request {}".format(
                    r.status_code, r.text, r.request.url))
            raise requests.exceptions.RequestException()

        if fullresponse:
            return r
        elif r.text != "":
            return r.json()
        else:
            return ""
Example #52
0
 def __init__(self, username, password):
     self.auth = (username, password)
     self.http = HTTPAdapter()