def handle_401(self, response, **kwargs): """Takes the given response and tries digest-auth, if needed.""" original_request = response.request.copy() www_authenticate = response.headers.get('www-authenticate', '').lower() www_auth_schemes = [x.strip().split()[0] for x in www_authenticate.split(',') if x.strip()] auths_to_try = [x for x in www_auth_schemes if x in [y.lower() for y in self.auth_map.keys()]] for auth_scheme in auths_to_try: for auth_instance in self.auth_map[auth_scheme]: #print 'trying', auth_instance, 'for', auth_scheme # Consume content and release the original connection # to allow our new request to reuse the same one. response.content response.raw.release_conn() prepared_request = original_request.copy() prepared_request.hooks = default_hooks() prepared_request.prepare_auth(auth_instance) adapter = HTTPAdapter() if self.session: adapter = self.session() or adapter new_response = adapter.send(prepared_request, **kwargs) new_response.history.append(response) new_response.request = prepared_request if new_response.status_code != 401: #print auth_instance, 'successful for', auth_scheme self.current_auth = auth_instance return new_response response = new_response return response
def __init__(self, config):
    self.config = config
    self.pool_manager = requests.Session()
    self.retry_methods = frozenset(['GET', 'HEAD', 'DELETE', 'OPTIONS'])
    # noinspection PyTypeChecker
    adapter = HTTPAdapter(
        pool_connections=config.http_pool_connections,
        pool_maxsize=config.http_pool_size,
        # max_retries=Retry(
        #     method_whitelist=self.retry_methods,
        #     total=config.http_max_retries,
        #     connect=config.http_max_retries,
        #     read=config.http_max_retries,
        #     status_forcelist=range(500, 600)
        # ),
        # pool_block=True
    )
    adapter.max_retries = config.http_max_retries
    self.pool_manager.mount('https://', adapter)
    self.pool_manager.mount('http://', adapter)
    self.pool_manager.verify = bool(self.config.verify_ssl)
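# Side note (assumption, not part of the snippet above): the commented-out Retry
# uses the old urllib3 keyword `method_whitelist`, which urllib3 >= 1.26 renames
# to `allowed_methods`. A minimal sketch of an equivalent retry policy:
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

retry = Retry(
    total=3,                    # stand-in for config.http_max_retries
    connect=3,
    read=3,
    status_forcelist=range(500, 600),
    allowed_methods=frozenset(['GET', 'HEAD', 'DELETE', 'OPTIONS']),
    backoff_factor=0.5,
)
adapter = HTTPAdapter(max_retries=retry, pool_block=True)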
def _on_request(self, request, **kwargs):
    match = self._find_match(request)
    # TODO(dcramer): find the correct class for this
    if match is None:
        raise ConnectionError('Connection refused')
    headers = {
        'Content-Type': match['content_type'],
    }
    if match['adding_headers']:
        headers.update(match['adding_headers'])
    response = HTTPResponse(
        status=match['status'],
        body=StringIO(match['body']),
        headers=headers,
        preload_content=False,
    )
    adapter = HTTPAdapter()
    r = adapter.build_response(request, response)
    if not match['stream']:
        r.content  # NOQA
    return r
def _on_request(self, request, **kwargs):
    match = self._find_match(request)
    # TODO(dcramer): find the correct class for this
    if match is None:
        error_msg = 'Connection refused: {0}'.format(request.url)
        response = ConnectionError(error_msg)
        self._calls.add(request, response)
        raise response
    headers = {
        'Content-Type': match['content_type'],
    }
    if match['adding_headers']:
        headers.update(match['adding_headers'])
    response = HTTPResponse(
        status=match['status'],
        body=BufferIO(match['body']),
        headers=headers,
        preload_content=False,
    )
    adapter = HTTPAdapter()
    response = adapter.build_response(request, response)
    if not match['stream']:
        response.content  # NOQA
    self._calls.add(request, response)
    return response
def send(self):
    self.url = "%s%s" % (self.base_url, self.path)
    prepped = self.prepare()
    s = Session()
    # print(self.data)
    h = HTTPAdapter()
    h.max_retries = 10
    s.mount('http://', h)
    s.mount('https://', h)
    response = s.send(prepped)
    response.needs_user_token = self.needs_user_token
    response.original_request = self
    return response
def __init__(self, app_id):
    self.app_id = app_id
    # Provides cookie persistence, connection pooling, and configuration.
    self.session = requests.Session()
    # Create a requests HTTP adapter and set the number of retries to attempt.
    adapter = HTTPAdapter()
    adapter.max_retries = 5
    # Register the transport adapter for the given URL prefix and enable connection retrying.
    self.session.mount(self.API_URL_PREFIX, adapter=adapter)
def send(self, request, **kwargs): if (self._is_cache_disabled or request.method not in self._cache_allowable_methods): response = super(CachedSession, self).send(request, **kwargs) response.from_cache = False return response cache_key = self.cache.create_key(request) def send_request_and_cache_response(): if self._deny_outbound: print(request.url) raise Exception(("ERROR: OutBound communication was attempted," " but deny_outbound was set to True")) cache_response = True response = super(CachedSession, self).send(request, **kwargs) if response.status_code in self._cache_allowable_codes: # # Special case for cblr: # if we get a status of pending then don't cache # try: if request.url.find('cblr') != -1 and request.method == 'GET': if isinstance(response.json(), dict) and response.json().get('status', '') == 'pending': cache_response = False except: cache_response = True if cache_response: self.cache.save_response(cache_key, response) response.from_cache = False return response response = self.cache.get_response(cache_key) if response is None: return send_request_and_cache_response() if 'Content-Encoding' in response.headers: del response.headers['Content-Encoding'] adapter = HTTPAdapter() response = adapter.build_response(request, response) # dispatch hook here, because we've removed it before pickling response.from_cache = True response = dispatch_hook('response', request.hooks, response, **kwargs) return response
def __init__(self, **kwargs):
    super(BetamaxAdapter, self).__init__()
    self.cassette = None
    self.cassette_name = None
    self.http_adapter = HTTPAdapter(**kwargs)
    self.serialize = None
    self.options = {}
def _on_request(self, request, **kwargs): match = self._find_match(request) # TODO(dcramer): find the correct class for this if match is None: error_msg = 'Connection refused: {0}'.format(request.url) response = ConnectionError(error_msg) self._calls.add(request, response) raise response if 'body' in match and isinstance(match['body'], Exception): self._calls.add(request, match['body']) raise match['body'] headers = { 'Content-Type': match['content_type'], } if 'callback' in match: # use callback status, r_headers, body = match['callback'](request) body = BufferIO(body.encode('utf-8')) headers.update(r_headers) elif 'body' in match: if match['adding_headers']: headers.update(match['adding_headers']) status = match['status'] body = BufferIO(match['body']) response = HTTPResponse( status=status, body=body, headers=headers, preload_content=False, ) adapter = HTTPAdapter() response = adapter.build_response(request, response) if not match.get('stream'): response.content # NOQA self._calls.add(request, response) return response
def __init__(self, username, password):
    """
    :username - Username in 'domain\\username' format
    :password - Password or hash in "ABCDABCDABCDABCD:ABCDABCDABCDABCD" format.
    """
    if ntlm is None:
        raise Exception("NTLM libraries unavailable")
    # parse the username
    user_parts = username.split('\\', 1)
    self.domain = user_parts[0].upper()
    self.username = user_parts[1]
    self.password = password
    self.adapter = HTTPAdapter()
def __init__(self, username, password):
    """
    :username - Username in 'domain\\username' format
    :password - Password or hash in "ABCDABCDABCDABCD:ABCDABCDABCDABCD" format.
    """
    if ntlm is None:
        raise Exception("NTLM libraries unavailable")
    # parse the username
    try:
        self.domain, self.username = username.split('\\', 1)
    except ValueError:
        raise ValueError("username should be in 'domain\\username' format.")
    self.domain = self.domain.upper()
    self.password = password
    self.adapter = HTTPAdapter()
def get_connection(self, *args, **kwargs):
    conn = HTTPAdapter.get_connection(self, *args, **kwargs)
    # Override the urlopen method on this connection
    if not hasattr(conn.urlopen, "wrapped"):
        orig_urlopen = conn.urlopen

        def urlopen(*args, **kwargs):
            timeout = kwargs.pop("timeout", None)
            if isinstance(timeout, Timeout):
                timeout = Timeout.from_float(timeout.connect_timeout)
            return orig_urlopen(*args, timeout=timeout, **kwargs)

        conn.urlopen = urlopen
        conn.urlopen.wrapped = True
    return conn
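# Minimal usage sketch of the override above (the class name
# ConnectTimeoutOnlyAdapter is hypothetical): the wrapped urlopen drops the read
# portion of a (connect, read) timeout, so only the connect timeout reaches urllib3.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.timeout import Timeout


class ConnectTimeoutOnlyAdapter(HTTPAdapter):
    def get_connection(self, *args, **kwargs):
        conn = HTTPAdapter.get_connection(self, *args, **kwargs)
        if not hasattr(conn.urlopen, "wrapped"):
            orig_urlopen = conn.urlopen

            def urlopen(*args, **kwargs):
                timeout = kwargs.pop("timeout", None)
                if isinstance(timeout, Timeout):
                    timeout = Timeout.from_float(timeout.connect_timeout)
                return orig_urlopen(*args, timeout=timeout, **kwargs)

            conn.urlopen = urlopen
            conn.urlopen.wrapped = True
        return conn


session = requests.Session()
session.mount('https://', ConnectTimeoutOnlyAdapter())
session.get('https://example.com', timeout=(3, 60))  # the 60 s read timeout is dropped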
class Foauth(BaseAdapter):
    """The foauth.org transport adapter."""

    def __init__(self, username, password):
        self.auth = (username, password)
        self.http = HTTPAdapter()

    def prepare_request(self, request):
        p = urlparse(request.url)
        # Rewrite the url to use foauth.org
        request.url = FOAUTH_TEMPLATE.format(domain=p.netloc, path=p.path)
        # Authenticate appropriately.
        request.prepare_auth(self.auth)
        return request

    def send(self, request, **kwargs):
        request = self.prepare_request(request)
        return self.http.send(request, **kwargs)
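# Minimal usage sketch for the Foauth adapter above (the credentials and the
# target URL are placeholders; FOAUTH_TEMPLATE is defined elsewhere in the
# original module): mount it on a Session so every request is rewritten to go
# through foauth.org with HTTP basic auth.
import requests

session = requests.Session()
session.mount('http://', Foauth('user@example.com', 'secret'))
session.mount('https://', Foauth('user@example.com', 'secret'))
response = session.get('https://api.github.com/user')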
def proxy_manager_for(self, *args, **kwargs):
    kwargs["ssl_context"] = self.ssl_context
    return HTTPAdapter.proxy_manager_for(self, *args, **kwargs)
def upload( self, parts: Union[VideoPart, List[VideoPart]], title: str, tid: int, tag: List[str], desc: str, source='', cover='', no_reprint: bool = True, dynamic='', dtime=None, open_elec: bool = True, open_subtitle: bool = True, max_retry=5, ): """ :param parts: 视频列表 VideoPart('part path', 'part title', 'part desc'), 或者 [VideoPart(...), VideoPart(...)] :type parts: Union[VideoPart, List[VideoPart]] :param title: 视频标题 :type title: str :param tid: 视频分区ID, 参考: https://member.bilibili.com/x/web/archive/pre 或者https://github.com/uupers/BiliSpider/wiki/%E8%A7%86%E9%A2%91%E5%88%86%E5%8C%BA%E5%AF%B9%E5%BA%94%E8%A1%A8 :type tid: int :param tag: 视频tag :type tag: List[str] :param desc: 视频简介 :type desc: str :param dtime: 定时发布的时间戳,可选(optional) publish date timestamp (10 digits Unix timestamp e.g. 1551533438) :type dtime: int :param source: 可选:(optional) 转载地址 :type source: str :param cover: 可选:视频封面(optional) cover's URL, use method *cover_up* to get :type cover: str :param no_reprint: 可选:是否允许转载(optional) Is reprint allowed :type no_reprint: bool :param dynamic: 粉丝动态 :type dynamic: str :param open_elec: 可选:是否开启充电(optional) whether to open charging panel :type open_elec: bool :param open_subtitle: 可选:是否允许上传字幕(optional) Is uploading subtitles allowed :type open_subtitle: bool :param max_retry: 可选:每块最大重试时间(optional) max retry times per chunk :type max_retry: int """ if len(title) > 80: raise Exception("标题长度超过80字") if len(source) > 200: raise Exception("转载地址长度超过200字") self.session.headers[ 'Content-Type'] = 'application/json; charset=utf-8' if not isinstance(parts, list): parts = [parts] # retry by status retries = Retry( total=max_retry, backoff_factor=1, status_forcelist=(504, ), ) self.session.mount('https://', HTTPAdapter(max_retries=retries)) self.session.mount('http://', HTTPAdapter(max_retries=retries)) # videos = [] for part in parts: filepath = part.path filename = os.path.basename(filepath) filesize = os.path.getsize(filepath) r = self.session.get( 'https://member.bilibili.com/preupload?' 'os=upos&upcdn=ws&name={name}&size={size}&r=upos&profile=ugcupos%2Fyb&ssl=0' .format(name=parse.quote_plus(filename), size=filesize)) """return example { "upos_uri": "upos://ugc/i181012ws18x52mti3gg0h33chn3tyhp.mp4", "biz_id": 58993125, "endpoint": "//upos-hz-upcdnws.acgvideo.com", "endpoints": [ "//upos-hz-upcdnws.acgvideo.com", "//upos-hz-upcdntx.acgvideo.com" ], "chunk_retry_delay": 3, "chunk_retry": 200, "chunk_size": 4194304, "threads": 2, "timeout": 900, "auth": "os=upos&cdn=upcdnws&uid=&net_state=4&device=&build=&os_version=&ak=×tamp=&sign=", "OK": 1 } """ json = r.json() upos_uri = json['upos_uri'] endpoint = json['endpoint'] auth = json['auth'] biz_id = json['biz_id'] chunk_size = json['chunk_size'] self.session.headers['X-Upos-Auth'] = auth # add auth header r = self.session.post('https:{}/{}?uploads&output=json'.format( endpoint, upos_uri.replace('upos://', ''))) # {"upload_id":"72eb747b9650b8c7995fdb0efbdc2bb6","key":"\/i181012ws2wg1tb7tjzswk2voxrwlk1u.mp4","OK":1,"bucket":"ugc"} json = r.json() upload_id = json['upload_id'] with open(filepath, 'rb') as f: chunks_num = math.ceil(filesize / chunk_size) chunks_index = -1 while True: chunks_data = f.read(chunk_size) if not chunks_data: break chunks_index += 1 # start with 0 def upload_chunk(): r = self.session.put( 'https:{endpoint}/{upos_uri}?' 
'partNumber={part_number}&uploadId={upload_id}&chunk={chunk}&chunks={chunks}&size={size}&start={start}&end={end}&total={total}' .format( endpoint=endpoint, upos_uri=upos_uri.replace('upos://', ''), part_number=chunks_index + 1, # starts with 1 upload_id=upload_id, chunk=chunks_index, chunks=chunks_num, size=len(chunks_data), start=chunks_index * chunk_size, end=chunks_index * chunk_size + len(chunks_data), total=filesize, ), chunks_data, ) return r def retry_upload_chunk(): """return :class:`Response` if upload success, else return None.""" for i in range(max_retry): r = upload_chunk() if r.status_code == 200: return r log.info(r.text) log.info('{}/{} retry stage {}/{}'.format( chunks_index, chunks_num, i, max_retry)) log.info('sleep %ds', 5 * i) time.sleep(5 * i) return None r = retry_upload_chunk() if r: log.info('upload part {}/{}'.format( chunks_index, chunks_num)) else: raise Exception( 'upload reach max retry times at part {}/{}'. format(chunks_index, chunks_num)) # NOT DELETE! Refer to https://github.com/comwrg/bilibiliupload/issues/15#issuecomment-424379769 self.session.post( 'https:{endpoint}/{upos_uri}?' 'output=json&name={name}&profile=ugcupos%2Fyb&uploadId={upload_id}&biz_id={biz_id}' .format( endpoint=endpoint, upos_uri=upos_uri.replace('upos://', ''), name=filename, upload_id=upload_id, biz_id=biz_id, ), { "parts": [{ "partNumber": i, "eTag": "etag" } for i in range(1, chunks_num + 1)] }, ) videos.append({ 'filename': upos_uri.replace('upos://ugc/', '').split('.')[0], 'title': part.title, 'desc': part.desc }) # if source is empty, copyright=1, else copyright=2 copyright = 2 if source else 1 def add(): r = self.session.post( 'https://member.bilibili.com/x/vu/web/add?csrf=' + self.csrf, json={ "copyright": copyright, "source": source, "title": title, "tid": tid, "tag": ','.join(tag), "no_reprint": int(no_reprint), "desc": desc, "cover": cover, "mission_id": 0, "order_id": 0, "videos": videos, "dtime": dtime, "open_elec": int(open_elec), "dynamic": dynamic, "subtitle": { "lan": "", "open": int(open_subtitle), }, }, ) return r def retry_add(): for i in range(max_retry): r = add() json = r.json() code = json['code'] if code == 0: return r # {"code":20001,"message":"投稿服务异常","ttl":1} if code in (20001, ): log.info('retry add video {}/{}, {}'.format( i, max_retry, r.text)) else: raise Exception('Fail to add video, {}'.format(r.text)) log.info('sleep %ds', 5 * i) time.sleep(5 * i) raise Exception('Add video reach max retry times.') r = retry_add() return r.json()
def __init__(self, host, port, token, index, allow_overrides=False, debug=False, flush_interval=15.0, force_keep_ahead=False, hostname=None, protocol='https', proxies=None, queue_size=DEFAULT_QUEUE_SIZE, record_format=False, retry_backoff=2.0, retry_count=5, source=None, sourcetype='text', timeout=60, verify=True): """ Args: host (str): The Splunk host param port (int): The port the host is listening on token (str): Authentication token index (str): Splunk index to write to allow_overrides (bool): Whether to look for _<param> in log data (ex: _index) debug (bool): Whether to print debug console messages flush_interval (float): How often thread should run to push events to splunk host force_keep_ahead (bool): Sleep instead of dropping logs when queue fills hostname (str): The Splunk Enterprise hostname protocol (str): The web protocol to use proxies (list): The proxies to use for the request queue_size (int): The max number of logs to queue, set to 0 for no max record_format (bool): Whether the log record will be json retry_backoff (float): The requests lib backoff factor retry_count (int): The number of times to retry a failed request source (str): The Splunk source param sourcetype (str): The Splunk sourcetype param timeout (float): The time to wait for a response from Splunk verify (bool): Whether to perform ssl certificate validation """ global instances instances.append(self) logging.Handler.__init__(self) self.allow_overrides = allow_overrides self.host = host self.port = port self.token = token self.index = index self.source = source self.sourcetype = sourcetype self.verify = verify self.timeout = timeout self.flush_interval = flush_interval self.force_keep_ahead = force_keep_ahead self.log_payload = "" self.SIGTERM = False # 'True' if application requested exit self.timer = None # It is possible to get 'behind' and never catch up, so we limit the queue size self.queue = list() self.max_queue_size = max(queue_size, 0) # 0 is min queue size self.debug = debug self.session = requests.Session() self.retry_count = retry_count self.retry_backoff = retry_backoff self.protocol = protocol self.proxies = proxies self.record_format = record_format # Keep ahead depends on queue size, so cannot be 0 if self.force_keep_ahead and not self.max_queue_size: self.write_log( "Cannot keep ahead of unbound queue, using default queue size") self.max_queue_size = DEFAULT_QUEUE_SIZE self.write_debug_log("Starting debug mode") if hostname is None: self.hostname = socket.gethostname() else: self.hostname = hostname self.write_debug_log("Preparing to override loggers") # prevent infinite recursion by silencing requests and urllib3 loggers logging.getLogger('requests').propagate = False logging.getLogger('urllib3').propagate = False # and do the same for ourselves logging.getLogger(__name__).propagate = False # disable all warnings from urllib3 package if not self.verify: requests.packages.urllib3.disable_warnings() if self.verify and self.protocol == 'http': print("[SplunkHandler DEBUG] " + 'cannot use SSL Verify and unsecure connection') if self.proxies is not None: self.session.proxies = self.proxies # Set up automatic retry with back-off self.write_debug_log("Preparing to create a Requests session") retry = Retry( total=self.retry_count, backoff_factor=self.retry_backoff, method_whitelist=False, # Retry for any HTTP verb status_forcelist=[500, 502, 503, 504]) self.session.mount(self.protocol + '://', HTTPAdapter(max_retries=retry)) self.start_worker_thread() self.write_debug_log("Class initialize complete")
class Youdao(AbstractDictionary): name = '有道词典' timeout = 10 headers = { 'Host': 'dict.youdao.com', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', } retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504]) session = requests.Session() session.mount('http://', HTTPAdapter(max_retries=retries)) session.mount('https://', HTTPAdapter(max_retries=retries)) def __init__(self): self.groups = [] def login(self, username: str, password: str, cookie: dict = None) -> dict: """ 登陆 :param username: 用户名 :param password: 密码 :param cookie: cookie :return: cookie dict """ self.session.cookies.clear() if cookie and self._checkCookie(cookie): return cookie else: return self._login(username, password) def _checkCookie(self, cookie) -> bool: """ cookie有效性检验 :param cookie: :return: bool """ rsp = requests.get('http://dict.youdao.com/wordbook/wordlist', cookies=cookie, headers=self.headers) if 'account.youdao.com/login' not in rsp.url: self.indexSoup = BeautifulSoup(rsp.text, features="html.parser") logger.info('Cookie有效') cookiesJar = requests.utils.cookiejar_from_dict(cookie, cookiejar=None, overwrite=True) self.session.cookies = cookiesJar return True logger.info('Cookie失效') return False def _login(self, username: str, password: str) -> dict: """账号和密码登陆""" data = ( ('app', 'mobile'), ('product', 'DICT'), ('tp', 'urstoken'), ('cf', '7'), ('show', 'true'), ('format', 'json'), ('username', username), ('password', hashlib.md5(password.encode('utf-8')).hexdigest()), ('um', 'true'), ) try: self.session.post(url='https://dict.youdao.com/login/acc/login', timeout=self.timeout, headers=self.headers, data=data) cookie = requests.utils.dict_from_cookiejar(self.session.cookies) if username and username.lower() in cookie.get('DICT_SESS', ''): # 登陆后获取单词本首页的soup对象 rsp = self.session.get( 'http://dict.youdao.com/wordbook/wordlist', timeout=self.timeout) self.indexSoup = BeautifulSoup(rsp.text, features="html.parser") logger.info('登陆成功') return cookie else: logger.error('登陆失败') return {} except Exception as error: logger.exception(f'网络异常:{error}') return {} def getGroups(self) -> [(str, int)]: """ 获取单词本分组 :return: [(group_name,group_id)] """ elements = self.indexSoup.find('select', id='select_category') groups = [] if elements: groups = elements.find_all('option') groups = [(e.text, e['value']) for e in groups] logger.info(f'单词本分组:{groups}') self.groups = groups return groups def getTotalPage(self, groupName: str, groupId: int) -> int: """ 获取分组下总页数 :param groupName: 分组名称 :param groupId:分组id :return: """ totalPages = 1 try: r = self.session.get( url='http://dict.youdao.com/wordbook/wordlist', timeout=self.timeout, params={'tags': groupId}) soup = BeautifulSoup(r.text, features='html.parser') pagination = soup.find('div', id='pagination') if pagination: finalPageHref = pagination.find_all( 'a', class_='next-page')[-1].get('href') groups = re.search(r"wordlist\?p=(\d*)", finalPageHref) if groups: totalPages = int(groups.group(1)) else: totalPages = 1 except Exception as error: logger.exception(f'网络异常{error}') finally: totalPages = totalPages - 1 if totalPages > 1 else totalPages logger.info(f'该分组({groupName}-{groupId})下共有{totalPages}页') return totalPages def getWordsByPage(self, pageNo: int, groupName: str, groupId: str) -> [str]: """ 获取分组下每一页的单词 :param pageNo: 页数 :param groupName: 分组名 :param groupId: 分组id :return: """ wordList = [] try: logger.info(f'获取单词本(f{groupName}-{groupId})第:{pageNo + 1}页') rsp = 
self.session.get( 'http://dict.youdao.com/wordbook/wordlist', params={ 'p': pageNo, 'tags': groupId }, ) soup = BeautifulSoup(rsp.text, features='html.parser') table = soup.find(id='wordlist').table.tbody rows = table.find_all('tr') for row in rows: cols = row.find_all('td') wordList.append(cols[1].div.a.text.strip()) except Exception as e: logger.exception(f'网络异常{e}') finally: logger.info(wordList) return wordList
import requests, json, os, sys
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import mapping.mapping as m
import conf as c

REQ_SESSION = requests.Session()
retries = Retry(total=10, backoff_factor=1, status_forcelist=[502, 503, 504, 524])
REQ_SESSION.mount('http://', HTTPAdapter(max_retries=retries))
# REQ_SESSION.mount('https://', HTTPAdapter(max_retries=retries))

params = {
    'key_identity': c.CONF["KEY_IDENTITY"],
    'key_credential': c.CONF["KEY_CREDENTIALS"]
}
# print("\nc.CONF", c.CONF)
print("\n")

########################
# manually
########################
# 1. upload ontologies
# 2. upload custom controlled vocabularies: use the same names as in vocabularies.json
# 3. copy the ids of vocabularies in vocabularies.json and substitute IDs in "templates" folder (except for City, District, Country)
# 4. reconcile City, District, Country to geonames and save mappings in vocabularies.json
# 5. upload templates where vocabularies are already selected (import in next instances -- control vocab ids match with correct number)
# 6. download google spreadsheet tables as tsv in "tables" folder
def get_connection(self, url, proxies=None):
    url = url.replace(self.redirect_source, self.redirect_target)
    return HTTPAdapter.get_connection(self, url, proxies=proxies)
def set_max_retry(self, for_url, max_retries):
    self._session.mount(for_url, HTTPAdapter(max_retries=max_retries))
class BetamaxAdapter(BaseAdapter): """This object is an implementation detail of the library. It is not meant to be a public API and is not exported as such. """ def __init__(self, **kwargs): super(BetamaxAdapter, self).__init__() self.cassette = None self.cassette_name = None self.old_adapters = kwargs.pop('old_adapters', {}) self.http_adapter = HTTPAdapter(**kwargs) self.serialize = None self.options = {} def cassette_exists(self): if self.cassette_name and os.path.exists(self.cassette_name): return True return False def close(self): self.http_adapter.close() def eject_cassette(self): if self.cassette: self.cassette.eject() self.cassette = None # Allow self.cassette to be garbage-collected def load_cassette(self, cassette_name, serialize, options): self.cassette_name = cassette_name self.serialize = serialize self.options.update(options.items()) placeholders = self.options.get('placeholders', []) default_options = Cassette.default_cassette_options match_requests_on = self.options.get( 'match_requests_on', default_options['match_requests_on'] ) preserve_exact_body_bytes = self.options.get( 'preserve_exact_body_bytes', ) self.cassette = Cassette( cassette_name, serialize, placeholders=placeholders, record_mode=self.options.get('record'), preserve_exact_body_bytes=preserve_exact_body_bytes ) if 'record' in self.options: self.cassette.record_mode = self.options['record'] self.cassette.match_options = match_requests_on re_record_interval = timedelta.max if self.options.get('re_record_interval'): re_record_interval = timedelta(self.options['re_record_interval']) now = datetime.utcnow() if re_record_interval < (now - self.cassette.earliest_recorded_date): self.cassette.clear() def send(self, request, stream=False, timeout=None, verify=True, cert=None, proxies=None): interaction = None if not self.cassette: raise BetamaxError('No cassette was specified or found.') if self.cassette.interactions: interaction = self.cassette.find_match(request) if not interaction and self.cassette.is_recording(): interaction = self.send_and_record( request, stream, timeout, verify, cert, proxies ) if not interaction: raise BetamaxError(unhandled_request_message(request, self.cassette)) resp = interaction.as_response() resp.connection = self return resp def send_and_record(self, request, stream=False, timeout=None, verify=True, cert=None, proxies=None): adapter = self.find_adapter(request.url) response = adapter.send( request, stream=True, timeout=timeout, verify=verify, cert=cert, proxies=proxies ) self.cassette.save_interaction(response, request) return self.cassette.interactions[-1] def find_adapter(self, url): for (prefix, adapter) in self.old_adapters.items(): if url.lower().startswith(prefix): return adapter
from bs4 import BeautifulSoup import requests, os, csv, urllib.parse, urllib3, time, random from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry urllib3.disable_warnings() retry_strategy = Retry( total=20, status_forcelist=[429, 500, 502, 503, 504], method_whitelist=["HEAD", "GET", "OPTIONS"], backoff_factor=2 ) adapter = HTTPAdapter(max_retries=retry_strategy) http = requests.Session() http.mount("https://", adapter) http.mount("http://", adapter) def get_proxies(): proxyList = [ {"http":"169.159.179.248:8080"}, {"http":"105.208.17.58:8080"}, {"http":"160.119.44.210:8080"}, {"http":"196.214.145.106:80"}, {"https":"41.194.37.106:45381"},{"https":"41.222.159.191:8080"},{"https":"66.251.179.207:8080"} ] return proxyList def rotate_proxy(proxies): proxy_select = random.randint(-1,len(proxies)-1) if proxy_select < 0 or os.getenv('proxy_enabled', default=False): time.sleep(random.uniform(0.5, 1.25)) return None
'url': download_url, 'expire': (now + expire).strftime('%Y-%m-%d') } UrlRecorder.save_record(entry['class_name'], record) def _request(self, entry, method, url, **kwargs): if not self.requests: self.requests = requests.Session() if entry_headers := entry.get('headers'): if brotli: entry_headers['accept-encoding'] = 'gzip, deflate, br' self.requests.headers.update(entry_headers) if entry_cookie := entry.get('cookie'): self.requests.cookies.update( NetUtils.cookie_str_to_dict(entry_cookie)) self.requests.mount('http://', HTTPAdapter(max_retries=2)) self.requests.mount('https://', HTTPAdapter(max_retries=2)) try: response = self.requests.request(method, url, timeout=60, **kwargs) if response is not None and response.content: if re.search( NetworkErrorReason.DDoS_protection_by_Cloudflare.value, NetUtils.decode(response)): cf_cookie = asyncio.run(SiteBase.get_cf_cookie(entry)) self.requests.cookies.update( NetUtils.cookie_str_to_dict(cf_cookie)) response = self.requests.request(method, url, timeout=60, **kwargs) return response
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36', } proxies = {'http': '127.0.0.1:8080'} inject_point = {'id': '2'} conf = { 'url': url, 'method': 'post', 'proxies': {}, 'inject': inject_point, 'debug': False } conf['proxies'] = proxies req = requests.session() req.headers = headers req.mount('http://', HTTPAdapter(max_retries=3)) req.mount('https://', HTTPAdapter(max_retries=3)) req.proxies = conf['proxies'] def main(): print( 'Help info\nPlease provide your action:[get_current_user|get_current_db|get_dbs|get_tables|get_columns|read_file|dump|dump_all]' ) print('main') act = raw_input("Please raw_input your action:") action(act) def put_file_contents(filename, contents): with open(filename, "a+") as fin:
# Alex Mueller # # # ####################### import os import io import requests from requests.adapters import HTTPAdapter from requests.exceptions import ConnectionError from PIL import Image, ImageOps, ImageDraw from io import BytesIO SITE_URL = 'https://images.alexonsager.net' SPRITES = './Assets/Sprites' pokemonAdapter = HTTPAdapter(max_retries = 10) session = requests.Session() session.mount(SITE_URL, pokemonAdapter) pokemon = ["Missingno.","Bulbasaur","Ivysaur","Venusaur","Charmander","Charmeleon","Charizard","Squirtle","Wartortle","Blastoise","Caterpie","Metapod","Butterfree","Weedle","Kakuna","Beedrill","Pidgey","Pidgeotto","Pidgeot","Rattata","Raticate","Spearow","Fearow","Ekans","Arbok","Pikachu","Raichu","Sandshrew","Sandslash","Nidoran(f)","Nidorina","Nidoqueen","Nidoran(m)","Nidorino","Nidoking","Clefairy","Clefable","Vulpix","Ninetales","Jigglypuff","Wigglytuff","Zubat","Golbat","Oddish","Gloom","Vileplume","Paras","Parasect","Venonat","Venomoth","Diglett","Dugtrio","Meowth","Persian","Psyduck","Golduck","Mankey","Primeape","Growlithe","Arcanine","Poliwag","Poliwhirl","Poliwrath","Abra","Kadabra","Alakazam","Machop","Machoke","Machamp","Bellsprout","Weepinbell","Victreebel","Tentacool","Tentacruel","Geodude","Graveler","Golem","Ponyta","Rapidash","Slowpoke","Slowbro","Magnemite","Magneton","Farfetchd","Doduo","Dodrio","Seel","Dewgong","Grimer","Muk","Shellder","Cloyster","Gastly","Haunter","Gengar","Onix","Drowzee","Hypno","Krabby","Kingler","Voltorb","Electrode","Exeggcute","Exeggutor","Cubone","Marowak","Hitmonlee","Hitmonchan","Lickitung","Koffing","Weezing","Rhyhorn","Rhydon","Chansey","Tangela","Kangaskhan","Horsea","Seadra","Goldeen","Seaking","Staryu","Starmie","Mr. Mime","Scyther","Jynx","Electabuzz","Magmar","Pinsir","Tauros","Magikarp","Gyarados","Lapras","Ditto","Eevee","Vaporeon","Jolteon","Flareon","Porygon","Omanyte","Omastar","Kabuto","Kabutops","Aerodactyl","Snorlax","Articuno","Zapdos","Moltres","Dratini","Dragonair","Dragonite","Mewtwo","Mew"] prefixes = ["Miss","Bulb","Ivy","Venu","Char","Char","Char","Squirt","War","Blast","Cater","Meta","Butter","Wee","Kak","Bee","Pid","Pidg","Pidg","Rat","Rat","Spear","Fear","Ek","Arb","Pika","Rai","Sand","Sand","Nido","Nido","Nido","Nido","Nido","Nido","Clef","Clef","Vul","Nine","Jiggly","Wiggly","Zu","Gol","Odd","Gloo","Vile","Pa","Para","Veno","Veno","Dig","Dug","Meow","Per","Psy","Gol","Man","Prime","Grow","Arca","Poli","Poli","Poli","Ab","Kada","Ala","Ma","Ma","Ma","Bell","Weepin","Victree","Tenta","Tenta","Geo","Grav","Gol","Pony","Rapi","Slow","Slow","Magne","Magne","Far","Do","Do","See","Dew","Gri","Mu","Shell","Cloy","Gas","Haunt","Gen","On","Drow","Hyp","Krab","King","Volt","Electr","Exegg","Exegg","Cu","Maro","Hitmon","Hitmon","Licki","Koff","Wee","Rhy","Rhy","Chan","Tang","Kangas","Hors","Sea","Gold","Sea","Star","Star","Mr.","Scy","Jyn","Electa","Mag","Pin","Tau","Magi","Gyara","Lap","Dit","Ee","Vapor","Jolt","Flare","Pory","Oma","Oma","Kabu","Kabu","Aero","Snor","Artic","Zap","Molt","Dra","Dragon","Dragon","Mew","Mew"] postfixes = 
["ssingno.","basaur","ysaur","usaur","mander","meleon","izard","tle","tortle","toise","pie","pod","free","dle","una","drill","gey","eotto","eot","tata","icate","row","row","kans","bok","chu","chu","shrew","slash","oran","rina","queen","ran","rino","king","fairy","fable","pix","tales","puff","tuff","bat","bat","ish","oom","plume","ras","sect","nat","moth","lett","trio","th","sian","duck","duck","key","ape","lithe","nine","wag","whirl","wrath","ra","bra","kazam","chop","choke","champ","sprout","bell","bell","cool","cruel","dude","eler","em","ta","dash","poke","bro","mite","ton","fetchd","duo","drio","eel","gong","mer","uk","der","ster","tly","ter","gar","ix","zee","no","by","ler","orb","ode","cute","utor","bone","wak","lee","chan","tung","fing","zing","horn","don","sey","gela","khan","sea","dra","deen","king","yu","mie","mime","ther","nx","buzz","mar","sir","ros","karp","dos","ras","to","vee","eon","eon","eon","gon","nyte","star","to","tops","dactyl","lax","cuno","dos","tres","tini","nair","nite","two","ew"] if not os.path.exists(SPRITES): os.makedirs(SPRITES) for i in range(152): for j in range(152): name = "" url = "%s/pokemon/fused/%d/%d.%d.png" % (SITE_URL, i, i, j)
__credits__ = ["Huevos", "WanWizard"]
__license__ = "GPL"
__version__ = "1.0.1"

POLARISATION = {'H': 0, 'V': 1, 'L': 2, 'R': 3}
SYSTEMS = {'DVB-S': 0, 'DVB-S2': 1, 'DSS': -1, 'ISDB': -1, 'Digicipher 2': -1, 'ABS': -1}
FECS = {'auto': 0, '1/2': 1, '2/3': 2, '3/4': 3, '3/5': 4, '4/5': 5, '5/6': 6, '6/7': 7, '7/8': 8, '8/9': 9, '9/10': 10, 'none': 15}
MODULATIONS = {'auto': 0, 'QPSK': 1, '8PSK': 2, 'QAM16': 3, '16APSK': 4, '32APSK': 5, '8PSK Turbo': -1, 'Turbo': -1}
SLEEP_TIMEOUT = 10
PARSER = 'html5lib'

SESSION = requests.Session()
SESSION.mount('http://', HTTPAdapter(max_retries=5))
SESSION.mount('https://', HTTPAdapter(max_retries=5))
SESSION.headers.update({'User-Agent': 'Mozilla/5.0'})

IS_PY3 = sys.version_info >= (3, 0)


def eprint(*args, **kwargs):
    """ print data in std error """
    print(*args, file=sys.stderr, **kwargs)


def escape(title):
    """ xml escape title """
    title = title.replace('&', '&amp;').replace('<', '&lt;')
    return title.replace('>', '&gt;').replace('\"', '&quot;')
def download_file(URL, name_file, path_out, retries=10, backoff=10, size_format='Decimal', show_download_progress=True): ''' Save data in file. Parameters ---------- URL : str Link of file. name_file : str Name of output file. path_out : str, optional, default '' Path of folder where file will be saved. retries : int, optional, default 10 Defines the retries number to connect to server. See: https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#module-urllib3.util.retry backoff: int, optional, default 10 A backoff factor to apply between attempts after the second try. See: https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#module-urllib3.util.retry size_format: str, optional, default 'Decimal' Defines how is print the size of file. Options are: 'Decimal' : divide file size (in bytes) by (1000*1000) 'Binary' : divide file size (in bytes) by (1024*1024) show_download_progress : boolean, optional, default True Parameter to enable and disable the visualization of download progress. ''' StartTime = datetime.now() retries_config = Retry(total=retries, backoff_factor=backoff, status_forcelist=[500, 502, 503, 504]) session = requests.Session() session.mount('http://', HTTPAdapter(max_retries=retries_config)) session.mount('https://', HTTPAdapter(max_retries=retries_config)) req = session.get(URL, stream=True) #req = requests.get(URL, stream=True) total_size = int(req.headers['content-length']) size = 0 if size_format == 'Binary': dsize = 1024 * 1024 else: dsize = 1000 * 1000 with open(path_out + name_file, 'wb') as output_file: for chunk in req.iter_content(chunk_size=1024): if chunk: rec_size = output_file.write(chunk) size = rec_size + size if show_download_progress == True: print(' {} {:3.0f}% {:.1f}MB {}'.format( name_file, 100.0 * size / total_size, size / dsize, '{}m{}s'.format( round((datetime.now() - StartTime).seconds / 60.0), (datetime.now() - StartTime).seconds % 60) if (datetime.now() - StartTime).seconds > 60 else '{}s'.format((datetime.now() - StartTime).seconds)), end="\r") #, flush=True) #print('\t{}\t{:3.0f}%\t{:.2f} min'.format(name_file,100.0*size/total_size, (datetime.now()-StartTime).seconds/60.0), end="\r") #, flush=True) if size == total_size: #print('\n') print(' {} {:3.0f}% {:.1f}MB {}'.format( name_file, 100.0 * size / total_size, size / dsize, '{}m{}s'.format( round((datetime.now() - StartTime).seconds / 60.0), (datetime.now() - StartTime).seconds % 60) if (datetime.now() - StartTime).seconds > 60 else '{}s'.format( (datetime.now() - StartTime).seconds)))
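# Hypothetical call of download_file() above (the URL and output paths are
# placeholders); the retry and backoff values mirror the documented defaults.
download_file(
    URL='https://example.com/data/sample.nc',
    name_file='sample.nc',
    path_out='./downloads/',
    retries=10,
    backoff=10,
    size_format='Decimal',
    show_download_progress=True,
)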
def make_request(method, url, conn, stream=False, **kwargs): """ Makes a REST request. Parameters ---------- method : {'GET', 'POST', 'PUT', 'DELETE'} HTTP method. url : str URL. conn : :class:`Connection` Connection authentication and configuration. stream : bool, default False Whether to stream the response contents. **kwargs Initialization arguments to :class:`requests.Request`. Returns ------- :class:`requests.Response` """ if method.upper() not in _VALID_HTTP_METHODS: raise ValueError( "`method` must be one of {}".format(_VALID_HTTP_METHODS)) # add auth to headers kwargs.setdefault('headers', {}).update(conn.auth) with requests.Session() as session: session.mount(url, HTTPAdapter(max_retries=conn.retry)) try: request = requests.Request(method, url, **kwargs).prepare() # retry loop for broken connections MAX_RETRIES = conn.retry.total for retry_num in range(MAX_RETRIES + 1): logger.debug("Making request ({} retries)".format(retry_num)) try: response = _make_request(session, request, conn.ignore_conn_err, stream=stream) except requests.ConnectionError as e: if ((retry_num == MAX_RETRIES) or ("BrokenPipeError" not in str(e))): if not conn.ignore_conn_err: raise e else: return fabricate_200() time.sleep(1) else: break except (requests.exceptions.BaseHTTPError, requests.exceptions.RequestException) as e: if not conn.ignore_conn_err: raise e # else fall through to fabricate 200 response else: if response.ok or not conn.ignore_conn_err: return response # else fall through to fabricate 200 response return fabricate_200()
class HttpNtlmAuth(AuthBase): """HTTP NTLM Authentication Handler for Requests. Supports pass-the-hash.""" def __init__(self, username, password): """ :username - Username in 'domain\\username' format :password - Password or hash in "ABCDABCDABCDABCD:ABCDABCDABCDABCD" format. """ if ntlm is None: raise Exception("NTLM libraries unavailable") #parse the username try: self.domain, self.username = username.split('\\', 1) except ValueError: raise ValueError("username should be in 'domain\\username' format.") self.domain = self.domain.upper() self.password = password self.adapter = HTTPAdapter() def retry_using_http_NTLM_auth(self, auth_header_field, auth_header, response, args): """Attempts to authenticate using HTTP NTLM challenge/response""" if auth_header in response.request.headers: return response request = copy_request(response.request) # initial auth header with username. will result in challenge auth = 'NTLM %s' % ntlm.create_NTLM_NEGOTIATE_MESSAGE("%s\\%s" % (self.domain,self.username)) request.headers[auth_header] = auth # we must keep the connection because NTLM authenticates the connection, not single requests request.headers["Connection"] = "Keep-Alive" # A streaming response breaks authentication. # This can be fixed by not streaming this request, which is safe because # the returned response3 will still have stream=True set if specified in # args. In addition, we expect this request to give us a challenge # and not the real content, so the content will be short anyway. args_nostream = dict(args, stream=False) response2 = self.adapter.send(request, **args_nostream) # this is important for some web applications that store authentication-related info in cookies (it took a long time to figure out) if response2.headers.get('set-cookie'): request.headers['Cookie'] = response2.headers.get('set-cookie') # get the challenge auth_header_value = response2.headers[auth_header_field] ntlm_header_value = list(filter(lambda s: s.startswith('NTLM '), auth_header_value.split(',')))[0].strip() ServerChallenge, NegotiateFlags = ntlm.parse_NTLM_CHALLENGE_MESSAGE(ntlm_header_value[5:]) # build response request = copy_request(request) auth = 'NTLM %s' % ntlm.create_NTLM_AUTHENTICATE_MESSAGE(ServerChallenge, self.username, self.domain, self.password, NegotiateFlags) request.headers[auth_header] = auth response3 = self.adapter.send(request, **args) # Update the history. response3.history.append(response) response3.history.append(response2) return response3 def response_hook(self, r, **kwargs): if r.status_code == 401 and 'ntlm' in r.headers.get('www-authenticate','').lower(): return self.retry_using_http_NTLM_auth('www-authenticate', 'Authorization', r, kwargs) if r.status_code == 407 and 'ntlm' in r.headers.get('proxy-authenticate','').lower(): return self.retry_using_http_NTLM_auth('proxy-authenticate', 'Proxy-authorization', r, kwargs) return r def __call__(self, r): r.register_hook('response', self.response_hook) return r
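# Minimal usage sketch for HttpNtlmAuth above (host and credentials are
# placeholders): the auth object registers its response hook per request, so it
# plugs into requests like any other AuthBase implementation.
import requests

response = requests.get(
    'https://intranet.example.com/protected',
    auth=HttpNtlmAuth('EXAMPLE\\jdoe', 'Password1'),
)
print(response.status_code)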
from multiprocessing import Manager
from multiprocessing.pool import ThreadPool
from time import sleep
import logging

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib3.exceptions import HTTPError
import numpy as np
import pandas as pd

session = requests.session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

logger = logging.getLogger(__name__)


class AMAPCrawler:
    def __init__(self, keys: (list, tuple, str) = None):
        if keys is None:
            self.keys = ['87a08092f3e9c212e6f06e6327d9f385']
        else:
            if isinstance(keys, str):
                keys = [keys]
            self.keys = keys
def __init__(self, url, method, data=None, kerberos_auth=False, allow_redirects=True, verify_ssl=True, ca=None, use_json=False, headers=None, stream=False, username=None, password=None, client_cert=None, client_key=None, verbose=False, retries_enabled=True): def log_error_response_text_hook(resp, *args, **kwargs): """requests hook to log error response""" if 400 <= resp.status_code <= 599: logger.debug('Error response from "%r": "%r"', resp.url, resp.text) self.finished = False # have we read all data? self.closed = False # have we destroyed curl resources? self.status_code = 0 self.headers = None retry = Retry( total=HTTP_MAX_RETRIES, connect=HTTP_MAX_RETRIES, backoff_factor=HTTP_BACKOFF_FACTOR, status_forcelist=HTTP_RETRIES_STATUS_FORCELIST, method_whitelist=HTTP_RETRIES_METHODS_WHITELIST, raise_on_status=False, ) self.session = requests.Session() self.session.hooks['response'] = [log_error_response_text_hook] if retries_enabled: self.session.mount('http://', HTTPAdapter(max_retries=retry)) self.session.mount('https://', HTTPAdapter(max_retries=retry)) self.url = url headers = headers or {} method = method.lower() if method not in ['post', 'get', 'put', 'patch', 'delete']: raise RuntimeError("Unsupported method '%s' for curl call!" % method) args = {} if method in ['post', 'put', 'patch']: headers['Expect'] = '' if not verify_ssl: args['verify'] = False else: if ca: args['verify'] = ca else: args['verify'] = True if username and password: args['auth'] = (username, password) if client_cert and client_key: args['cert'] = (client_cert, client_key) if data: args['data'] = data if use_json: headers['Content-Type'] = 'application/json' args['allow_redirects'] = allow_redirects if kerberos_auth: if not HTTPKerberosAuth: raise RuntimeError('Kerberos auth unavailable') args['auth'] = HTTPKerberosAuth() if stream: args['stream'] = True args['headers'] = headers args['timeout'] = HTTP_REQUEST_TIMEOUT self.req = self.session.request(method, url, **args) self.headers = self.req.headers self.status_code = self.req.status_code
def get_bars(symbols_list, outdir, start, end): """ Description: \n Pulls the data for aggregated bars for symbols given. \n NOTE: These values are unadjusted since splits are adjusted for manually \n Parameters: \n symbols_list ([str]): list of tickers \n outdir (str): folder to put csv file in \n start (str): start date to start collecting bar data \n end (str): end date to stop collecting bar data \n """ session = requests.Session() # In case I run into issues, retry my connection retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[ 500, 502, 503, 504 ]) session.mount('http://', HTTPAdapter(max_retries=retries)) count = 0 barlog = open("barlog.txt", "w") for symbol in symbols_list: try: r = session.get(POLYGON_AGGS_URL.format(symbol, start, end, POLYGON_API_KEY)) if r: data = r.json() # create a pandas dataframe from the information if data['queryCount'] > 0: df = pd.DataFrame(data['results']) df['t'] = pd.to_datetime(df['t'], unit='ms') df['date'] = pd.to_datetime(df['t'], unit='ms') df['date'] = df['date'].dt.date.astype(str) df.set_index('date', inplace=True) df['symbol'] = symbol df.drop(columns=['vw', 'n'], inplace=True) df.rename(columns={'v': 'volume', 'o': 'open', 'c': 'close', 'h': 'high', 'l': 'low', 't': 'date'}, inplace=True) df.to_csv('{}/{}.csv'.format(outdir, symbol), index=True) count += 1 # Logging, I could write a short method for this to reuse msg = (symbol + ' file created with record count ' + str(data['queryCount'])) print(msg) barlog.write(msg) barlog.write("\n") else: msg = ('No data for symbol ' + str(symbol)) print(msg) barlog.write(msg) barlog.write("\n") else: msg = ('No response for symbol ' + str(symbol)) print(msg) barlog.write(msg) barlog.write("\n") # Raise exception but continue except: msg = ('****** exception raised for symbol ' + str(symbol)) print(msg) barlog.write(msg) barlog.write("\n") barlog.close() return ('{} file were exported'.format(count))
def __init__(self, *args, **kwargs):
    self.redirect_source = kwargs.pop("source")
    self.redirect_target = kwargs.pop("target")
    HTTPAdapter.__init__(self, *args, **kwargs)
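# Minimal usage sketch (the class name RedirectAdapter is hypothetical): an
# adapter built from the __init__ above plus the get_connection override shown
# earlier transparently sends requests for one host to another.
import requests
from requests.adapters import HTTPAdapter


class RedirectAdapter(HTTPAdapter):
    def __init__(self, *args, **kwargs):
        self.redirect_source = kwargs.pop("source")
        self.redirect_target = kwargs.pop("target")
        HTTPAdapter.__init__(self, *args, **kwargs)

    def get_connection(self, url, proxies=None):
        url = url.replace(self.redirect_source, self.redirect_target)
        return HTTPAdapter.get_connection(self, url, proxies=proxies)


session = requests.Session()
session.mount('http://old.example.com',
              RedirectAdapter(source='old.example.com', target='new.example.com'))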
import logging import time from concurrent import futures import pyquery import requests from requests.adapters import HTTPAdapter from libs.request import Request DEFAULT_POOL_SIZE = 100 HTTP_ADAPTER = HTTPAdapter(pool_connections=DEFAULT_POOL_SIZE, pool_maxsize=DEFAULT_POOL_SIZE + 300) class BaseCrawler: def __init__(self): self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'} self.session = requests.session() self.session.mount('https://', HTTP_ADAPTER) self.session.mount('http://', HTTP_ADAPTER) self.charset = 'utf-8' self.logger = logging.getLogger(self.__class__.__name__) self.config = {} self.error_request = [] def crawl(self, iterables, thread=None): result_list = [] n = len(iterables)
from pybatfish.datamodel.referencelibrary import ( # noqa: F401 NodeRoleDimension, NodeRolesData, ReferenceBook) from pybatfish.settings.issues import IssueConfig # noqa: F401 from pybatfish.util import BfJsonEncoder from .options import Options # suppress the urllib3 warnings due to old version of urllib3 (inside requests) requests.packages.urllib3.disable_warnings(InsecureRequestWarning) # Setup a session, configure retry policy _requests_session = requests.Session() # Prefix "http" will cover both "http" & "https" _requests_session.mount( "http", HTTPAdapter( max_retries=Retry(connect=Options.max_tries_to_connect_to_coordinator, backoff_factor=Options.request_backoff_factor))) _encoder = BfJsonEncoder() __all__ = [ 'add_issue_config', 'add_node_role_dimension', 'add_reference_book', 'delete_issue_config', 'fork_snapshot', 'get_issue_config', 'get_network', 'get_node_role_dimension', 'get_node_roles', 'get_reference_book', 'get_reference_library', 'read_question_settings', 'write_question_settings' ] def add_issue_config(session, issue_config): # type: (Session, IssueConfig) -> None
def get_response(link, retry_params=None, headers=None, timeout=None, proxies=None, session=None): """ get_response gets the responses of the a URL. :param link: link to the content to be recieved :type link: str :param retry_params: rules to retry :type retry_params: dict, optional :param headers: headers for the request :type headers: dict, optional :param timeout: timeout parameters for the request :type timeout: tuple, optional :param proxies: proxies :type proxies: dict, optional :param session: a session object to be used :type session: requests.Session, optional :return: response from the url :rtype: requests.models.Response """ if retry_params is None: retry_params = {} retry_params = { **{ 'retries': 5, 'backoff_factor': 0.3, 'status_forcelist': (500, 502, 504) }, **retry_params } if headers is None: headers = random_user_agent() if timeout is None: timeout = (5, 14) if session is None: session = requests.Session() if proxies is None: proxies = {} retry = Retry( total=retry_params.get('retries'), read=retry_params.get('retries'), connect=retry_params.get('retries'), backoff_factor=retry_params.get('backoff_factor'), status_forcelist=retry_params.get('status_forcelist'), ) adapter = HTTPAdapter(max_retries=retry) session.mount('http://', adapter) session.mount('https://', adapter) page = session.get(link, headers=headers, proxies=proxies) status = page.status_code return page
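# Hypothetical call of get_response() above, relying on its built-in defaults
# (5 retries, backoff factor 0.3, retry on 500/502/504, random user agent):
page = get_response('https://example.com/')
print(page.status_code)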
class BetamaxAdapter(BaseAdapter): """This object is an implementation detail of the library. It is not meant to be a public API and is not exported as such. """ def __init__(self, **kwargs): super(BetamaxAdapter, self).__init__() self.cassette = None self.cassette_name = None self.http_adapter = HTTPAdapter(**kwargs) self.serialize = None self.options = {} def cassette_exists(self): if self.cassette_name and os.path.exists(self.cassette_name): return True return False def close(self): self.http_adapter.close() def eject_cassette(self): if self.cassette: self.cassette.eject() self.cassette = None # Allow self.cassette to be garbage-collected def load_cassette(self, cassette_name, serialize, options): self.cassette_name = cassette_name self.serialize = serialize self.options.update(options) placeholders = self.options.get('placeholders') # load cassette into memory if self.cassette_exists(): self.cassette = Cassette(cassette_name, serialize, placeholders=placeholders) elif os.path.exists(os.path.dirname(cassette_name)): self.cassette = Cassette(cassette_name, serialize, 'w+', placeholders=placeholders) else: raise RuntimeError( 'No cassette could be loaded or %s does not exist.' % os.path.dirname(cassette_name) ) self.cassette.record_mode = self.options['record'] re_record_interval = timedelta.max if self.options.get('re_record_interval'): re_record_interval = timedelta(self.options['re_record_interval']) now = datetime.utcnow() if re_record_interval < (now - self.cassette.earliest_recorded_date): self.cassette.clear() def send(self, request, stream=False, timeout=None, verify=True, cert=None, proxies=None): interaction = None match_on = Cassette.default_cassette_options['match_requests_on'] response = None if not self.cassette: raise BetamaxError('No cassette was specified or found.') if self.cassette.interactions: self.cassette.match_options = set(match_on) interaction = self.cassette.find_match(request) if not interaction and self.cassette.is_recording(): response = self.http_adapter.send( request, stream=True, timeout=timeout, verify=verify, cert=cert, proxies=proxies ) self.cassette.save_interaction(response, request) interaction = self.cassette.interactions[-1] if not interaction: raise BetamaxError('A request was made that could not be handled') return interaction.as_response()
class BetamaxAdapter(BaseAdapter): """This object is an implementation detail of the library. It is not meant to be a public API and is not exported as such. """ def __init__(self, **kwargs): super(BetamaxAdapter, self).__init__() self.cassette = None self.cassette_name = None self.old_adapters = kwargs.pop('old_adapters', {}) self.http_adapter = HTTPAdapter(**kwargs) self.serialize = None self.options = {} def cassette_exists(self): """Check if cassette exists on file system. :returns: bool -- True if exists, False otherwise """ if self.cassette_name and os.path.exists(self.cassette_name): return True return False def close(self): """Propagate close to underlying adapter.""" self.http_adapter.close() def eject_cassette(self): """Eject currently loaded cassette.""" if self.cassette: self.cassette.eject() self.cassette = None # Allow self.cassette to be garbage-collected def load_cassette(self, cassette_name, serialize, options): """Load cassette. Loads a previously serialized http response as a cassette :param str cassette_name: (required), name of cassette :param str serialize: (required), type of serialization i.e 'json' :options dict options: (required), options for cassette """ self.cassette_name = cassette_name self.serialize = serialize self.options.update(options.items()) placeholders = self.options.get('placeholders', {}) cassette_options = {} default_options = cassette.Cassette.default_cassette_options match_requests_on = self.options.get( 'match_requests_on', default_options['match_requests_on'] ) cassette_options['preserve_exact_body_bytes'] = self.options.get( 'preserve_exact_body_bytes', ) cassette_options['allow_playback_repeats'] = self.options.get( 'allow_playback_repeats' ) cassette_options['record_mode'] = self.options.get('record') for option, value in list(cassette_options.items()): if value is None: cassette_options.pop(option) self.cassette = cassette.Cassette( cassette_name, serialize, placeholders=placeholders, cassette_library_dir=self.options.get('cassette_library_dir'), **cassette_options ) if 'record' in self.options: self.cassette.record_mode = self.options['record'] # NOTE(sigmavirus24): Cassette.match_options is a set, might as well # use that instead of overriding it. self.cassette.match_options.update(match_requests_on) re_record_interval = timedelta.max if self.options.get('re_record_interval'): re_record_interval = timedelta(self.options['re_record_interval']) now = datetime.utcnow() if re_record_interval < (now - self.cassette.earliest_recorded_date): self.cassette.clear() def send(self, request, stream=False, timeout=None, verify=True, cert=None, proxies=None): """Send request. :param request request: request :returns: A Response object """ interaction = None current_cassette = self.cassette if not current_cassette: raise BetamaxError('No cassette was specified or found.') if current_cassette.interactions: interaction = current_cassette.find_match(request) if not interaction and current_cassette.is_recording(): interaction = self.send_and_record( request, stream, timeout, verify, cert, proxies ) if not interaction: raise BetamaxError(unhandled_request_message(request, current_cassette)) resp = interaction.as_response() resp.connection = self return resp def send_and_record(self, request, stream=False, timeout=None, verify=True, cert=None, proxies=None): """Send request and record response. The response will be serialized and saved to a cassette which can be replayed in the future. 
:param request request: request :param bool stream: (optional) defer download until content is accessed :param float timeout: (optional) time to wait for a response :param bool verify: (optional) verify SSL certificate :param str cert: (optional) path to SSL client :param proxies dict: (optional) mapping protocol to URL of the proxy :return: Interaction :rtype: class:`betamax.cassette.Interaction` """ adapter = self.find_adapter(request.url) response = adapter.send( request, stream=True, timeout=timeout, verify=verify, cert=cert, proxies=proxies ) return self.cassette.save_interaction(response, request) def find_adapter(self, url): """Find adapter. Searches for an existing adapter where the url and prefix match. :param url str: (required) url of the adapter :returns: betamax adapter """ for (prefix, adapter) in self.old_adapters.items(): if url.lower().startswith(prefix): return adapter
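# For context, a minimal sketch of the public Betamax API that drives the
# adapter above (the cassette directory, cassette name, and URL are placeholders):
import requests
from betamax import Betamax

session = requests.Session()
with Betamax(session, cassette_library_dir='cassettes') as recorder:
    recorder.use_cassette('example')
    session.get('https://httpbin.org/get')  # recorded or replayed via the cassette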
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import ssl
from functools import wraps

from config import ConfigSectionMap

cfg = ConfigSectionMap('setup')
START = cfg['start']
END = cfg['end']

session = requests.Session()
retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
session.proxies = proxies
session.headers = headers
session.mount('https://', HTTPAdapter(max_retries=retries))


def sslwrap(func):
    @wraps(func)
    def bar(*args, **kw):
        kw['ssl_version'] = ssl.PROTOCOL_TLSv1
        return func(*args, **kw)
    return bar


ssl.wrap_socket = sslwrap(ssl.wrap_socket)

output_file = open(cfg['output_file'], 'a', encoding='utf-8')
exception_urls_file = open(cfg['exception_url_file'], 'a', encoding='utf-8')
def __init__(self, base_url): self.base_url = base_url self.retry_session = requests.Session() self.retry_session.mount(self.base_url, HTTPAdapter( max_retries=Retry(total=3, status_forcelist=[503]) ))
def init_poolmanager(self, *args, **kwargs): kwargs["ssl_context"] = self.ssl_context return HTTPAdapter.init_poolmanager(self, *args, **kwargs)
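This override only makes sense inside an HTTPAdapter subclass that carries an ssl_context attribute. A minimal self-contained sketch of that pattern (the class name and context settings are illustrative, not taken from the original source):

import ssl
import requests
from requests.adapters import HTTPAdapter


class SSLContextAdapter(HTTPAdapter):
    """Transport adapter that feeds a custom SSLContext to urllib3's pool manager."""

    def __init__(self, ssl_context=None, **kwargs):
        # Set the context before HTTPAdapter.__init__, which calls init_poolmanager().
        self.ssl_context = ssl_context
        super(SSLContextAdapter, self).__init__(**kwargs)

    def init_poolmanager(self, *args, **kwargs):
        kwargs["ssl_context"] = self.ssl_context
        return super(SSLContextAdapter, self).init_poolmanager(*args, **kwargs)


context = ssl.create_default_context()
session = requests.Session()
session.mount('https://', SSLContextAdapter(ssl_context=context))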
def product_parse(self, response): if len(response.text) < 40000: yield scrapy.Request(url=response.request.url, callback=self.product_parse, dont_filter=True, meta=response.meta) return None item = response.meta['item'] # 商品链接 product_url = response.request.url # 商品ID ProductID = product_url.split('/')[-1].split('.')[0] # 商品链接urlID urlID = product_url.split('/')[-2] # 商品链接urlID urlID = product_url.split('/')[-2] # 店铺名称 try: shop_name = re.findall('shopName":"(.*?)"', response.text)[0] except: try: shop_name = re.findall('"curShopName":.*?>(.*?)</a>"', response.text)[0] except: try: shop_name = response.xpath( ".//div[@class='si-intro-list']/dl[1]/dd/a/text()" ).extract()[0] except: shop_name = None #去掉shopname中的空白字符 shop_name = re.sub(r'\r', '', shop_name) shop_name = re.sub(r'\t', '', shop_name) shop_name = re.sub(r'\n', '', shop_name) shop_name = re.sub(r' ', '', shop_name) # 商品名称 try: p_Name = response.xpath( ".//div[@class='imgzoom-main']/a[@id='bigImg']/img/@alt" ).extract()[0] except: try: p_Name = re.findall('"itemDisplayName":"(.*?)"', response.text)[0] except: p_Name = None #类别 try: X_type = Selector(response).re('"分类":"(.*?)"')[0] except: try: X_type = Selector(response).re( '分类</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: X_type = re.findall('"分类":"(.*?)"', response.text)[0] except: X_type = None # 品牌 try: brand = Selector(response).re('"brandName":"(.*?)"')[0] except: try: brand = Selector(response).re('<li><b>品牌</b>:(.*?)</li>')[0] except: try: brand = re.findall('"brandName":"(.*?)"', response.text)[0] except: brand = None # 去掉品牌括号内容 if brand: if re.findall(r'(.*?)', brand): re_com = re.compile('(.*?)') brand = brand[:0] + re.sub(re_com, '', brand) if brand: if re.findall(r'\(.*?\)', brand): re_cn = re.compile('\(.*?\)') brand = brand[:0] + re.sub(re_cn, '', brand) # 颜色 color = None # 类型,商品型号 try: X_name = Selector(response).re( '型号</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: X_name = re.findall( '型号</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] if X_name == None: X_name = re.findall( '型号</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] except: X_name = None if X_name: if brand: if brand in X_name: X_name = X_name[:0] + re.sub(brand, '', X_name) X_name = X_name[:0] + re.sub(r'(.*?)', '', X_name) X_name = X_name[:0] + re.sub(r'\(.*?\)', '', X_name) #安装方式 try: install = Selector(response).re('安装方式:(.*?)</li>')[0] except: try: install = Selector(response).re( '安装方式</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: install = re.findall('安装方式:(.*?)</li>', response.text)[0] except: try: install = re.findall( '安装方式</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] except: install = None #是否可以直饮 try: drink = Selector(response).re('是否直饮:(.*?)</li>')[0] except: try: drink = Selector(response).re( '是否直饮</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: drink = re.findall('是否直饮:(.*?)</li>', response.text)[0] except: try: drink = re.findall( '是否直饮</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] except: drink = None #滤芯种类 try: kinds = Selector(response).re('滤芯种类:(.*?)</li>')[0] except: try: kinds = Selector(response).re( '滤芯种类</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: kinds = re.findall('滤芯种类:(.*?)</li>', response.text)[0] except: try: kinds = re.findall( '滤芯种类</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] except: kinds = None #滤芯使用寿命 try: life = Selector(response).re('滤芯寿命:(.*?)</li>')[0] except: try: 
life = Selector(response).re( '滤芯寿命</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: life = re.findall('滤芯寿命:(.*?)</li>', response.text)[0] except: try: life = re.findall( '滤芯寿命</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] except: life = None #过滤精度 try: precision = Selector(response).re('过滤精度:(.*?)</li>')[0] except: try: precision = Selector(response).re( '过滤精度</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: precision = re.findall('过滤精度:(.*?)</li>', response.text)[0] except: try: precision = re.findall( '过滤精度</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] except: precision = None # 核心参数 type = '"' soup = BeautifulSoup(response.text, 'lxml') try: ul = soup.find('ul', attrs={'class': 'cnt clearfix'}) li = ul.find_all('li') for i in range(len(li)): type = type[:] + li[i].text if i < len(li) - 1: type = type[:] + ' ' if i == len(li) - 1: type = type[:] + '"' except: try: # 部分核心参数格式更改 div = soup.find('div', class_='prod-detail-container') ul = div.find('ul', attrs={'class': 'clearfix'}) li = ul.find_all('li') for each in li: li_li = each.find_all('li') for i in range(len(li_li)): type = type[:] + li_li[i].text if i < len(li_li) - 1: type = type[:] + ' ' if i == len(li_li) - 1: type = type[:] + '"' except: type = None if type: if len(type) < 2: type = None if type == None: try: parameter_id = Selector(response).re( '"mainPartNumber":"(.*?)"')[0] except: try: parameter_id = re.findall('"mainPartNumber":"(.*?)"', response.text)[0] except: parameter_id = None type = None if parameter_id: try: parameter_id = Selector(response).re( '"mainPartNumber":"(.*?)"')[0] parameter_url = 'https://product.suning.com/pds-web/ajax/itemParameter_%s_R0105002_10051.html' % parameter_id para_response = requests.get(parameter_url).text time.sleep(0.3) eles = re.findall('"snparameterdesc":"(.*?)"', para_response) souls = re.findall('"snparameterVal":"(.*?)"', para_response) try: type = '"' for i in range(len(eles)): type = type[:] + eles[i] + ':' + souls[i] if i < len(eles) - 1: type = type[:] + ' ' if i == len(eles) - 1: type = type[:] + '"' if len(type) < 2: type = None except: type = None if brand == None: try: brand = re.findall( '"snparameterdesc":"品牌","snparameterVal":"(.*?)"', para_response)[0] except: brand = None try: X_name = re.findall( '"snparameterdesc":"型号","snparameterVal":"(.*?)"', para_response)[0] except: X_name = None if X_name: if brand: if brand in X_name: X_name = X_name[:0] + re.sub(brand, '', X_name) X_name = X_name[:0] + re.sub(r'(.*?)', '', X_name) X_name = X_name[:0] + re.sub(r'\(.*?\)', '', X_name) #类别 if X_type == None: try: X_type = re.findall( '"snparameterdesc":"分类","snparameterVal":"(.*?)"', para_response)[0] except: X_type = None #安装方式 if install == None: try: install = re.findall( '"snparameterdesc":"安装方式","snparameterVal":"(.*?)"', para_response)[0] except: install = None #是否直饮 if drink == None: try: drink = re.findall( '"snparameterdesc":"是否直饮","snparameterVal":"(.*?)"', para_response)[0] except: drink = None #滤芯种类 if kinds == None: try: kinds = re.findall( '"snparameterdesc":"滤芯种类","snparameterVal":"(.*?)"', para_response)[0] except: kinds = None #滤芯使用寿命 if life == None: try: life = re.findall( '"snparameterdesc":"滤芯寿命","snparameterVal":"(.*?)"', para_response)[0] except: life = None #过滤精度 if precision == None: try: precision = re.findall( '"snparameterdesc":"过滤精度","snparameterVal":"(.*?)"', para_response)[0] except: precision = None except: pass # 获取相关请求url keyword_url = 
'https://review.suning.com/ajax/getreview_labels/general-000000000' + ProductID + '-' + urlID + '-----commodityrLabels.htm' comment_url = 'https://review.suning.com/ajax/review_satisfy/general-000000000' + ProductID + '-' + urlID + '-----satisfy.htm' price_url = 'https://pas.suning.com/nspcsale_0_000000000' + ProductID + '_000000000' + ProductID + '_' + urlID + '_10_010_0100101_20268_1000000_9017_10106_Z001.html' # 获取印象关键字 try: keyword_response = requests.get(keyword_url).text keyword_text = json.loads( re.findall(r'\((.*?)\)', keyword_response)[0]) keyword_list = keyword_text.get('commodityLabelCountList') key_str = '"' keyword = [] for i in range(len(keyword_list)): key_str = key_str[:] + keyword_list[i].get('labelName') if i < len(keyword_list) - 1: key_str = key_str[:] + ' ' if i == len(keyword_list) - 1: key_str = key_str[:] + '"' keyword.append(key_str) except: keyword = None # 获取评价信息 try: comment_response = requests.get(comment_url).text comment_text = json.loads( re.findall(r'\((.*?)\)', comment_response)[0]) comment_list = comment_text.get('reviewCounts')[0] # 差评 PoorCount = comment_list.get('oneStarCount') twoStarCount = comment_list.get('twoStarCount') threeStarCount = comment_list.get('threeStarCount') fourStarCount = comment_list.get('fourStarCount') fiveStarCount = comment_list.get('fiveStarCount') # 评论数量 CommentCount = comment_list.get('totalCount') # 好评 GoodCount = fourStarCount + fiveStarCount # 中评 GeneralCount = twoStarCount + threeStarCount # 好评度 # 得到百分比取整函数 if CommentCount != 0: goodpercent = round(GoodCount / CommentCount * 100) generalpercent = round(GeneralCount / CommentCount * 100) poorpercent = round(PoorCount / CommentCount * 100) commentlist = [GoodCount, GeneralCount, PoorCount] percent_list = [goodpercent, generalpercent, poorpercent] # 对不满百分之一的判定 for i in range(len(percent_list)): if percent_list[i] == 0 and commentlist[ i] != 0 and CommentCount != 0: percent_list[i] = 1 nomaxpercent = 0 # 定义为累计不是最大百分比数值 # 好评度计算url='http://res.suning.cn/project/review/js/reviewAll.js?v=20170823001' if CommentCount != 0: maxpercent = max(goodpercent, generalpercent, poorpercent) for each in percent_list: if maxpercent != each: nomaxpercent += each GoodRateShow = 100 - nomaxpercent else: GoodRateShow = 100 else: PoorCount = 0 CommentCount = 0 GoodCount = 0 GeneralCount = 0 GoodRateShow = 100 except: PoorCount = 0 CommentCount = 0 GoodCount = 0 GeneralCount = 0 GoodRateShow = 100 # 有关价格 try: price_response = requests.get(price_url).text except requests.RequestException as e: # print(e) time.sleep(2) s = requests.session() s.keep_alive = False s.mount('https://', HTTPAdapter(max_retries=5)) price_response = s.get(price_url).text if len(price_response) > 900: try: price = re.findall('"refPrice":"(.*?)"', price_response)[0] PreferentialPrice = re.findall('"promotionPrice":"(.*?)"', price_response)[0] if len(price) < 1: price = re.findall('"netPrice":"(.*?)"', price_response)[0] if price: if float(price) < float(PreferentialPrice): tt = price price = PreferentialPrice PreferentialPrice = tt except: price = None PreferentialPrice = None else: time.sleep(3) price_response = requests.get(price_url).text if len(price_response) > 900: try: price = re.findall('"refPrice":"(.*?)"', price_response)[0] PreferentialPrice = re.findall('"promotionPrice":"(.*?)"', price_response)[0] if len(price) < 1: price = re.findall('"netPrice":"(.*?)"', price_response)[0] if price: if float(price) < float(PreferentialPrice): tt = price price = PreferentialPrice PreferentialPrice = tt except: price = None 
PreferentialPrice = None else: # 作出失败判断并将url归入重试 price_response = self.retry_price(price_url) if len(price_response) > 500: try: price = re.findall('"refPrice":"(.*?)"', price_response)[0] PreferentialPrice = re.findall( '"promotionPrice":"(.*?)"', price_response)[0] if len(price) < 1: price = re.findall('"netPrice":"(.*?)"', price_response)[0] if price: if float(price) < float(PreferentialPrice): tt = price price = PreferentialPrice PreferentialPrice = tt except: price = None PreferentialPrice = None else: PreferentialPrice = None price = None if kinds: if re.findall(r'\d', kinds) and len(kinds) < 3: level = kinds kinds = None else: level = None else: level = None # 防止出现多个字段出现为空 if p_Name == None and brand == None and type == None: yield None else: source = '苏宁' item['shop_name'] = shop_name item['p_Name'] = p_Name item['X_name'] = X_name item['type'] = type item['price'] = price item['PreferentialPrice'] = PreferentialPrice item['brand'] = brand item['keyword'] = keyword item['PoorCount'] = PoorCount item['CommentCount'] = CommentCount item['GoodCount'] = GoodCount item['GeneralCount'] = GeneralCount item['GoodRateShow'] = GoodRateShow item['install'] = install item['drink'] = drink item['source'] = source item['level'] = level item['kinds'] = kinds item['life'] = life item['precision'] = precision item['color'] = color item['product_url'] = product_url item['ProductID'] = ProductID item['X_type'] = X_type yield item
class HttpNtlmAuth(AuthBase): """HTTP NTLM Authentication Handler for Requests. Supports pass-the-hash.""" def __init__(self, username, password): """ :username - Username in 'domain\\username' format :password - Password or hash in "ABCDABCDABCDABCD:ABCDABCDABCDABCD" format. """ if ntlm is None: raise Exception("NTLM libraries unavailable") # parse the username user_parts = username.split('\\', 1) self.domain = user_parts[0].upper() self.username = user_parts[1] self.password = password self.adapter = HTTPAdapter() def retry_using_http_NTLM_auth(self, auth_header_field, auth_header, response): """Attempts to authenticate using HTTP NTLM challenge/response""" if auth_header in response.request.headers: return response request = response.request # initial auth header with username. will result in challenge auth = 'NTLM %s' % ntlm.create_NTLM_NEGOTIATE_MESSAGE("%s\\%s" % (self.domain, self.username)) request.headers[auth_header] = auth # we must keep the connection because NTLM authenticates the connection, not single requests request.headers["Connection"] = "Keep-Alive" response2 = self.adapter.send(request) # some web applications store authentication-related info in cookies, so carry them over to the follow-up request if response2.headers.get('set-cookie'): request.headers['Cookie'] = response2.headers.get('set-cookie') # get the challenge auth_header_value = response2.headers[auth_header_field] ServerChallenge, NegotiateFlags = ntlm.parse_NTLM_CHALLENGE_MESSAGE(auth_header_value[5:]) # build response auth = 'NTLM %s' % ntlm.create_NTLM_AUTHENTICATE_MESSAGE(ServerChallenge, self.username, self.domain, self.password, NegotiateFlags) request.headers[auth_header] = auth request.headers["Connection"] = "Close" response = self.adapter.send(request) return response def response_hook(self, r): if r.status_code == 401 and 'ntlm' in r.headers.get('www-authenticate', '').lower(): return self.retry_using_http_NTLM_auth('www-authenticate', 'Authorization', r) if r.status_code == 407 and 'ntlm' in r.headers.get('proxy-authenticate', '').lower(): return self.retry_using_http_NTLM_auth('proxy-authenticate', 'Proxy-authorization', r) return r def __call__(self, r): r.register_hook('response', self.response_hook) return r
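A short usage sketch for the auth handler above, assuming the class is importable from this module; the domain, account, and URL are placeholders:

import requests

session = requests.Session()
# The password may also be an "LM:NT" hash pair, per the constructor's docstring.
session.auth = HttpNtlmAuth('EXAMPLE\\alice', 'secret-or-LMHASH:NTHASH')
response = session.get('http://intranet.example.com/protected')
print(response.status_code)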
# -*- coding: utf-8 -*- from bs4 import BeautifulSoup as bs import requests from urllib3.util.retry import Retry from requests.adapters import HTTPAdapter outputfile = open('output.txt', 'w') s = requests.Session() s.mount('https://', HTTPAdapter(max_retries=5)) urls = [ 'https://www.changechecker.org/search-results.aspx?denominationId=1&subcategory=Year&subcategoryId=1000', 'https://www.changechecker.org/search-results.aspx?denominationId=3&subcategory=Year&subcategoryId=1000', 'https://www.changechecker.org/search-results.aspx?denominationId=4&subcategory=Sport&subcategoryId=1010', 'https://www.changechecker.org/search-results.aspx?denominationId=15&subcategory=Letter&subcategoryId=-1', 'https://www.changechecker.org/search-results?denominationId=9&subcategory=Year&subcategoryId=1000', 'https://www.changechecker.org/search-results.aspx?denominationId=16&subcategory=Definitive&subcategoryId=1020', 'https://www.changechecker.org/search-results.aspx?denominationId=7&subcategory=Year&subcategoryId=1000', 'https://www.changechecker.org/search-results.aspx?denominationId=5&subcategory=Year&subcategoryId=1000', 'https://www.changechecker.org/search-results.aspx?denominationId=2&subcategory=Year&subcategoryId=1000', 'https://www.changechecker.org/search-results.aspx?denominationId=6&subcategory=Year&subcategoryId=1000' ] headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0', } for url in urls: urldata = s.get(url, headers=headers) soup = bs(urldata.text, 'html.parser') elements = soup.find_all('div', class_='divCoinBackground') coinlinks = [] for element in elements: coinlink = 'https://www.changechecker.org/' + element.a['href']
def download_result_file(url, result_file_directory, result_file_name, decompress, overwrite): """ Download file with specified URL and download parameters. :param result_file_directory: The download result local directory name. :type result_file_directory: str :param result_file_name: The download result local file name. :type result_file_name: str :param decompress: Determines whether to decompress the ZIP file. If set to true, the file will be decompressed after download. The default value is false, in which case the downloaded file is not decompressed. :type decompress: bool :param overwrite: Indicates whether the result file should overwrite the existing file if any. :type overwrite: bool :return: The download file path. :rtype: str """ if result_file_directory is None: raise ValueError('result_file_directory cannot be None.') if result_file_name is None: result_file_name="default_file_name" if decompress: name, ext=os.path.splitext(result_file_name) if ext == '.zip': raise ValueError("Result file can't be decompressed into a file with extension 'zip'." " Please change the extension of the result_file_name or pass decompress=false") zip_file_path=os.path.join(result_file_directory, name + '.zip') result_file_path=os.path.join(result_file_directory, result_file_name) else: result_file_path=os.path.join(result_file_directory, result_file_name) zip_file_path=result_file_path if os.path.exists(result_file_path) and overwrite is False: if six.PY3: raise FileExistsError('Result file: {0} exists'.format(result_file_path)) else: raise OSError('Result file: {0} exists'.format(result_file_path)) pool_manager=PoolManager( ssl_version=ssl.PROTOCOL_SSLv3, ) http_adapter=HTTPAdapter() http_adapter.poolmanager=pool_manager s=requests.Session() s.mount('https://', http_adapter) r=s.get(url, stream=True, verify=True) r.raise_for_status() try: with open(zip_file_path, 'wb') as f: for chunk in r.iter_content(chunk_size=4096): if chunk: f.write(chunk) f.flush() if decompress: with contextlib.closing(zipfile.ZipFile(zip_file_path)) as compressed: first=compressed.namelist()[0] with open(result_file_path, 'wb') as f: f.write(compressed.read(first)) except Exception as ex: raise ex finally: if decompress and os.path.exists(zip_file_path): os.remove(zip_file_path) return result_file_path
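A possible call site for the helper above; the URL and paths are placeholders used only for illustration:

# Downloads a ZIP report and unpacks its first member into report.csv.
path = download_result_file(
    url='https://example.com/reports/report.zip',  # placeholder URL
    result_file_directory='/tmp/reports',
    result_file_name='report.csv',
    decompress=True,
    overwrite=True,
)
print(path)  # /tmp/reports/report.csv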
def mount(self, pool_connections=5, pool_maxsize=120): self.request.mount('https://', HTTPAdapter(pool_connections, pool_maxsize))
def __init__(self, host='localhost', port=1317, tls=False): self.host = host self.port = port self.tls = tls self.session = requests.Session() self.session.mount(self.host, HTTPAdapter(max_retries=MAX_RETRIES))
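Worth noting: Session.mount() matches its prefix against the full request URL, so the adapter above only takes effect when host includes the scheme. A hedged sketch under that assumption (the class name and endpoint path are placeholders, not taken from the original source):

# Hypothetical client class wrapping the constructor above.
client = LcdClient(host='https://localhost', port=1317, tls=True)
# URLs must start with the mounted prefix (scheme included) for MAX_RETRIES to apply.
resp = client.session.get('%s:%d/node_info' % (client.host, client.port))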
subs = "%s" else: warn("No recognized database specified: {}".format(args.database)) sys.exit() format_url = "http://eol.org/data_objects/{}" rating_subexp = r"(?:.+?(\d) star.+?^(\d+))" rating_regexp = r"^<dl class=.rating_counts.>{0}{0}{0}{0}{0}".format( rating_subexp) rating_flags = re.MULTILINE | re.DOTALL sess = requests.Session() retries = Retry(total=args.retries, backoff_factor=1, status_forcelist=[500, 502, 503, 504]) sess.mount('http://', HTTPAdapter(max_retries=retries)) db_curs = db_connection.cursor() for im_tab in args.images_table: db_curs.execute( "SELECT DISTINCT src_id FROM {} WHERE src_id IS NOT NULL AND src = {} ORDER BY (rating_confidence IS NOT NULL), updated DESC;" .format(im_tab, subs), args.eol_src_flag) rows = db_curs.fetchall() for i, row in enumerate(rows): EOL_data_object_id = int(row[0]) if args.verbosity: print("{}: looking for ratings for EoL data object {}".format( i, EOL_data_object_id)) response = sess.get(format_url.format(EOL_data_object_id), timeout=10) m = re.search(rating_regexp, response.text, rating_flags) if m:
""" from typing import Any, Dict, List import requests from bs4 import BeautifulSoup from requests.adapters import HTTPAdapter, Retry MAX_RETRIES = 16 retry_strategy = Retry( total=MAX_RETRIES, status_forcelist=[429, 500, 502, 503, 504], ) SESSION = requests.Session() SESSION.mount("http://", HTTPAdapter(max_retries=retry_strategy)) SESSION.mount("https://", HTTPAdapter(max_retries=retry_strategy)) class FetchDemandError(Exception): """ Object for demand fetching exceptions. """ # pylint: disable=unnecessary-pass pass def get_dates(season: str) -> List[str]: """ Get dates with available course demand.
import os import sys import time import requests from multiprocessing import Process from requests.adapters import HTTPAdapter # gevent.monkey.patch_all() reload(sys) headers = dict() headers["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36" headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8" headers["Accept-Encoding"] = "gzip, deflate, sdch" headers["Accept-Language"] = "zh-CN,zh;q=0.8" request_retry = HTTPAdapter(max_retries=3) def my_get(url, refer=None): session = requests.session() session.headers = headers if refer: headers["Referer"] = refer session.mount('https://', request_retry) session.mount('http://', request_retry) return session.get(url) def get_type_content(page): if page < 2: url = 'http://www.netbian.com/index.htm'
def _rest_request(self, url, method, params=None, body=None, fullresponse=False, use_base_url=True): # base request method for a REST request myheaders = {"User-Agent": "api.py"} if method in ["POST", "PUT"]: myheaders.update({'Content-type': 'application/json'}) retry_strategy = Retry(total=3, status_forcelist=[429, 500, 502, 503, 504], method_whitelist=["HEAD", "GET", "OPTIONS"]) session = requests.Session() session.mount(self.base_rest_url, HTTPAdapter(max_retries=retry_strategy)) if use_base_url: url = self.base_rest_url + url try: if method == "GET": request = requests.Request( method, url, params=params, auth=RequestsAuthPluginVeracodeHMAC(), headers=myheaders) prepared_request = request.prepare() r = session.send(prepared_request, proxies=self.proxies) elif method == "POST": r = requests.post(url, params=params, auth=RequestsAuthPluginVeracodeHMAC(), headers=myheaders, data=body) elif method == "PUT": r = requests.put(url, params=params, auth=RequestsAuthPluginVeracodeHMAC(), headers=myheaders, data=body) elif method == "DELETE": r = requests.delete(url, params=params, auth=RequestsAuthPluginVeracodeHMAC(), headers=myheaders) else: raise VeracodeAPIError("Unsupported HTTP method") except requests.exceptions.RequestException as e: logger.exception(self.connect_error_msg) raise VeracodeAPIError(e) if not (r.status_code == requests.codes.ok): logger.debug( "API call returned non-200 HTTP status code: {}".format( r.status_code)) if not (r.ok): logger.debug("Error retrieving data. HTTP status code: {}".format( r.status_code)) if r.status_code == 401: logger.exception( "Error [{}]: {} for request {}. Check that your Veracode API account credentials are correct." .format(r.status_code, r.text, r.request.url)) else: logger.exception("Error [{}]: {} for request {}".format( r.status_code, r.text, r.request.url)) raise requests.exceptions.RequestException() if fullresponse: return r elif r.text != "": return r.json() else: return ""
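A hedged example of how this helper might be invoked from within the same class; the endpoint path and parameters are placeholders, not part of the original source:

# GET against a relative path; base_rest_url is prepended because use_base_url defaults to True,
# and the parsed JSON body is returned since fullresponse defaults to False.
apps = self._rest_request('appsec/v1/applications', 'GET', params={'size': 100})

# Request the full Response object instead of parsed JSON.
raw = self._rest_request('appsec/v1/applications', 'GET', fullresponse=True)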
def __init__(self, username, password): self.auth = (username, password) self.http = HTTPAdapter()