def get_proxy(http_type='http'):
    """Return a proxy from the pool that can load the Forbes test page.

    Repeatedly asks ``proxy_pool`` for a high-anonymity proxy of the given
    type, requests the Forbes innovation page through it, and returns as soon
    as the response text contains the marker ``'Forbes Welcome'``.

    Args:
        http_type: Value used for the ``'type'`` field of the pool query.

    Returns:
        A one-entry dict ``{'https': <ip:port>}`` suitable for the
        ``proxies`` argument of ``requests``.
    """
    proxy = {'type': http_type, 'anonymity': 'high'}
    url = "https://www.forbes.com/innovation/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    }
    timeout = 20

    while True:
        logger.info("Start proxy_pool.get_single_proxy")
        proxy_ip = proxy_pool.get_single_proxy(proxy)
        if proxy_ip is None:
            # NOTE(review): retries immediately, with no delay, while the
            # pool has nothing to offer.
            logger.info("proxy_pool.get_single_proxy return None")
            continue

        # The test URL is https, so the candidate is registered under 'https'.
        proxies = {'https': proxy_ip['ip:port']}
        try:
            r = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
            if 'Forbes Welcome' in r.text:
                logger.info('ip:%s for forbes' % proxy_ip['ip:port'])
                return proxies
        except Exception as e:
            # Best effort: any failure with this candidate is logged and the
            # loop moves on to the next proxy.
            logger.info('Proxy Exception:%s' % e)
def get_proxy_http():
    """Block until the proxy pool yields a high-anonymity HTTP proxy.

    Each time ``proxy_pool.get_single_proxy`` returns ``None`` a notice is
    logged and the function sleeps 30 seconds before asking again.

    Returns:
        The first non-``None`` value handed back by the pool.
    """
    query = {'type': 'http', 'anonymity': 'high'}
    while True:
        candidate = proxy_pool.get_single_proxy(query)
        if candidate is not None:
            return candidate
        logger.info("No proxy !!!!!!!!!!!!!!!!!!!")
        time.sleep(30)
def get_proxy(http_type):
    """Poll the proxy pool for a high-anonymity proxy of type *http_type*.

    Progress messages go to stdout. Once the pool returns a record, its
    ``'ip:port'`` field is printed.

    Returns:
        A one-entry dict mapping the record's ``'ip'`` to its ``'port'``.
    """
    query = {'type': http_type, 'anonymity': 'high'}
    while True:
        print("Start proxy_pool.get_single_proxy")
        record = proxy_pool.get_single_proxy(query)
        if record is not None:
            break
        # NOTE(review): no delay between attempts while the pool is empty.
        print("proxy_pool.get_single_proxy return None")

    print(record['ip:port'])
    return {record['ip']: record['port']}
def get_proxy():
    """Wait for the proxy pool to supply a high-anonymity HTTP proxy.

    The pool is queried in a loop; after every ``None`` result the function
    sleeps for 60 seconds.

    Returns:
        The first non-``None`` value returned by the pool.
    """
    # A stricter filter was tried previously:
    # {'type': 'http', 'anonymity': 'high', 'ping': 1, 'transferTime': 1, 'country': 'cn'}
    query = {'type': 'http', 'anonymity': 'high'}
    while True:
        candidate = proxy_pool.get_single_proxy(query)
        if candidate is not None:
            return candidate
        time.sleep(60)
def request(url, callback):
    """Fetch *url* via ``http_client`` through a high-anonymity HTTPS proxy.

    Waits (sleeping 60 seconds between attempts) until the proxy pool returns
    a record, then issues the fetch using the record's ``"ip"`` and ``"port"``
    as proxy host and port.

    Args:
        url: Address passed to ``http_client.fetch``.
        callback: Passed through to ``http_client.fetch``.
    """
    # A stricter filter was tried previously:
    # {'type': 'https', 'anonymity':'high', 'ping':1, 'transferTime':5}
    query = {'type': 'https', 'anonymity': 'high'}
    while True:
        chosen = proxy_pool.get_single_proxy(query)
        if chosen is not None:
            break
        time.sleep(60)

    host = chosen["ip"]
    port = int(chosen["port"])
    http_client.fetch(url, callback, proxy_host=host, proxy_port=port)
def request(url, callback):
    """Fetch *url* via ``http_client`` through a high-anonymity HTTP proxy.

    Waits (sleeping 60 seconds between attempts) until the proxy pool returns
    a record, logs the URL, and issues the fetch with 10-second request and
    connect timeouts. The request headers come from the ``headers`` name in
    the enclosing scope.

    Args:
        url: Address passed to ``http_client.fetch``.
        callback: Passed through to ``http_client.fetch``.
    """
    # A stricter filter was tried previously:
    # {'type': 'http', 'anonymity':'high', 'ping':1, 'transferTime':5}
    query = {'type': 'http', 'anonymity': 'high'}
    while True:
        chosen = proxy_pool.get_single_proxy(query)
        if chosen is not None:
            break
        time.sleep(60)

    logger.info("crawler: %s", url)
    host = chosen["ip"]
    port = int(chosen["port"])
    http_client.fetch(
        url,
        callback,
        headers=headers,
        proxy_host=host,
        proxy_port=port,
        request_timeout=10,
        connect_timeout=10,
    )
def request(url, callback):
    """Fetch *url* via ``http_client`` through a high-anonymity HTTP proxy.

    Waits (sleeping 60 seconds between attempts) until the proxy pool returns
    a record, then issues the fetch with a 10-second request timeout.

    Args:
        url: Address passed to ``http_client.fetch``.
        callback: Passed through to ``http_client.fetch``.
    """
    global total  # declared by the original; not read or written here

    query = {'type': 'http', 'anonymity': 'high'}
    while True:
        chosen = proxy_pool.get_single_proxy(query)
        if chosen is not None:
            break
        time.sleep(60)

    host = chosen["ip"]
    port = int(chosen["port"])
    http_client.fetch(
        url,
        callback,
        proxy_host=host,
        proxy_port=port,
        request_timeout=10,
    )
def get_proxy(self, http_type):
    """Wait for the pool to supply a high-anonymity proxy of *http_type*.

    Every attempt is logged together with ``self.num``. After a ``None``
    result the function pauses 30 seconds, using ``gevent.sleep`` when
    ``socket.socket`` comes from ``gevent.socket`` and ``time.sleep``
    otherwise.

    Returns:
        The first non-``None`` value returned by the pool.
    """
    query = {'type': http_type, 'anonymity': 'high'}
    while True:
        logger.info("Start proxy_pool.get_single_proxy %s", self.num)
        found = proxy_pool.get_single_proxy(query)
        if found is not None:
            return found

        logger.info("proxy_pool.get_single_proxy return None")
        # Pick the sleep function that matches the socket implementation in
        # use (presumably so a gevent-patched process yields cooperatively).
        uses_gevent = socket.socket.__module__ == "gevent.socket"
        pause = gevent.sleep if uses_gevent else time.sleep
        pause(30)
def get_proxy(self):
    """Wait for the pool to supply a high-anonymity SOCKS4 or SOCKS5 proxy.

    The pool is queried in a loop; after every ``None`` result the function
    sleeps for 30 seconds.

    Returns:
        The first non-``None`` value returned by the pool.
    """
    query = {
        "$or": [{'type': 'socks4'}, {'type': 'socks5'}],
        'anonymity': 'high',
    }
    while True:
        found = proxy_pool.get_single_proxy(query)
        if found is not None:
            return found
        time.sleep(30)
def get_proxy():
    """Wait for the pool to supply a high-anonymity SOCKS4 or SOCKS5 proxy.

    After every ``None`` result a notice is logged and the function sleeps
    for 30 seconds before asking again.

    Returns:
        The first non-``None`` value returned by the pool.
    """
    query = {
        "$or": [{'type': 'socks4'}, {'type': 'socks5'}],
        'anonymity': 'high',
    }
    while True:
        found = proxy_pool.get_single_proxy(query)
        if found is not None:
            return found
        logger.info("No proxy !!!!!!!!!!!!!!!!!!!")
        time.sleep(30)
def request(url, callback):
    """Fetch *url* via ``http_client`` through a proxy matching its scheme.

    An ``https`` proxy is requested from the pool when the URL starts with
    "https" (case-insensitively); otherwise an ``http`` proxy is requested.
    The function waits, sleeping 60 seconds between attempts, until the pool
    returns a record, then issues the fetch with 10-second request and
    connect timeouts.

    Args:
        url: Address passed to ``http_client.fetch``.
        callback: Passed through to ``http_client.fetch``.
    """
    # Decide on the scheme prefix only: a substring search would also match
    # "https" appearing later in the URL (e.g. inside a query parameter) and
    # select the wrong proxy type for a plain-http URL.
    if url.lower().startswith("https"):
        proxy = {'type': 'https', 'anonymity': 'high'}
    else:
        proxy = {'type': 'http', 'anonymity': 'high'}

    proxy_ip = None
    while proxy_ip is None:
        proxy_ip = proxy_pool.get_single_proxy(proxy)
        if proxy_ip is None:
            time.sleep(60)

    http_client.fetch(
        url,
        callback,
        proxy_host=proxy_ip["ip"],
        proxy_port=int(proxy_ip["port"]),
        request_timeout=10,
        connect_timeout=10,
    )
def request(url, callback):
    """Fetch *url* via ``http_client`` through a high-anonymity HTTP proxy.

    Waits (sleeping 60 seconds between attempts) until the proxy pool returns
    a record, then issues the fetch with a fixed desktop-Chrome User-Agent
    header and a 60-second request timeout.

    Args:
        url: Address passed to ``http_client.fetch``.
        callback: Passed through to ``http_client.fetch``.
    """
    # A stricter filter was tried previously:
    # {'type': 'http', 'anonymity':'high', 'ping':1, 'transferTime':5}
    query = {'type': 'http', 'anonymity': 'high'}
    while True:
        chosen = proxy_pool.get_single_proxy(query)
        if chosen is not None:
            break
        time.sleep(60)

    http_header = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36'
    }
    host = chosen["ip"]
    port = int(chosen["port"])
    http_client.fetch(
        url,
        callback,
        proxy_host=host,
        proxy_port=port,
        request_timeout=60,
        headers=http_header,
    )
def init_http_session(self, url):
    """Point ``self.http_session`` at a fresh proxy suited to *url*.

    The proxy type is "https" when the URL starts with "https"
    (case-insensitively) and "http" otherwise. The function waits, sleeping
    60 seconds between attempts, until the pool returns a record. It then
    creates a ``requests.Session`` if none exists yet, installs the proxy for
    that scheme, and sets a fixed User-Agent header when ``self.header`` is
    truthy.

    Args:
        url: Address whose scheme selects the proxy type.
    """
    http_type = "https" if url.lower().startswith("https") else "http"

    query = {'type': http_type, 'anonymity': 'high'}
    while True:
        proxy_ip = proxy_pool.get_single_proxy(query)
        if proxy_ip is not None:
            break
        time.sleep(60)
    logger.info("Proxy IP(%s): %s" % (http_type, proxy_ip))

    # NOTE(review): nesting below is reconstructed from whitespace-collapsed
    # source; confirm the proxy and header are meant to be applied on every
    # call, not only when the session is first created.
    if self.http_session is None:
        self.http_session = requests.Session()

    proxy_url = "%s://%s:%s" % (http_type, proxy_ip["ip"], proxy_ip["port"])
    self.http_session.proxies = {http_type: proxy_url}

    if self.header:
        self.http_session.headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36"
def request(name, callback):
    """Fetch the so.com index overview for *name* through an HTTP proxy.

    A ``None`` name acts as a completion marker: the module-level ``total``
    counter is decremented and logged, ``begin()`` is called once the counter
    reaches zero or below, and no fetch is made.

    For any other name the function waits, sleeping 60 seconds between
    attempts, until the pool returns a high-anonymity HTTP proxy, then
    fetches the overview URL with a 10-second request timeout.

    Args:
        name: Query term appended verbatim to the URL, or ``None``.
        callback: Passed through to ``http_client.fetch``.
    """
    global total

    # Handle the marker first: this path never uses a proxy, so it should not
    # block on the pool (potentially for minutes) before doing its bookkeeping.
    if name is None:
        total -= 1
        logger.info(total)
        if total <= 0:
            begin()
        return

    proxy = {'type': 'http', 'anonymity': 'high'}
    proxy_ip = None
    while proxy_ip is None:
        proxy_ip = proxy_pool.get_single_proxy(proxy)
        if proxy_ip is None:
            time.sleep(60)

    # NOTE(review): name is not URL-encoded here; presumably callers pass an
    # already-encoded term — confirm.
    url = 'http://index.so.com/index.php?a=overviewJson&q=' + name + '&area=%E5%85%A8%E5%9B%BD'
    http_client.fetch(
        url,
        callback,
        proxy_host=proxy_ip["ip"],
        proxy_port=int(proxy_ip["port"]),
        request_timeout=10,
    )
def get_proxy():
    """Return a pool proxy that can load the cyzone test page.

    Repeatedly takes a proxy matching ``Rule`` from the pool, requests
    ``http://www.cyzone.cn/event`` through it with a 10-second timeout, and
    returns as soon as the response text contains ``'cyzone'``.

    Returns:
        A one-entry dict ``{'http': '<ip>:<port>'}`` suitable for the
        ``proxies`` argument of ``requests``.
    """
    # Loop invariants, built once.
    url = "http://www.cyzone.cn/event"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    }
    timeout = 10

    while True:
        try:
            item = proxy_pool.get_single_proxy(Rule)
            # If the pool returns None, the subscript below raises and the
            # handler logs it before the next attempt.
            ip, port = item['ip'], item['port']
            ip_port = ip + ':' + str(port)
            logger.info('%s:%s' % (ip, port))

            proxies = {'http': ip_port}
            r = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
            if 'cyzone' in r.text:
                logger.info('ip:%s for cyzone' % ip_port)
                return proxies
        except Exception as e:
            # Best effort: any failure with this candidate is logged and the
            # loop moves on to the next proxy.
            logger.info('Proxy Exception:%s' % e)
def get_proxy2():
    """Return a pool proxy that can load the Baidu home page over HTTPS.

    Repeatedly takes a proxy matching ``Rule2`` from the pool, requests
    ``https://www.baidu.com/`` through it with a 10-second timeout, and
    returns as soon as the response text contains ``'hao123'``. Progress and
    errors are printed to stdout.

    Returns:
        A tuple ``(ip, port)`` taken from the pool record.
    """
    # Loop invariants, built once.
    url = "https://www.baidu.com/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    }
    timeout = 10

    while True:
        try:
            item = proxy_pool.get_single_proxy(Rule2)
            # If the pool returns None, the subscript below raises and the
            # handler prints it before the next attempt.
            ip, port = item['ip'], item['port']
            ip_port = ip + ':' + str(port)
            print('%s:%s' % (ip, port))

            proxies = {'https': ip_port}
            r = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
            if 'hao123' in r.text:
                print('ip:%s for qimai' % ip_port)
                return ip, port
        except Exception as e:
            # Best effort: any failure with this candidate is printed and the
            # loop moves on to the next proxy.
            print('Proxy Exception:%s' % e)
def get_session(proxy, new, agent):
    """Return the module-level ``requests`` session, routed through a pool proxy.

    A new ``requests.Session`` is created when *new* is truthy or no session
    exists yet. When *agent* is truthy the session's User-Agent header is set
    from ``get_a_user_agent()``. The function then waits, sleeping 60 seconds
    between attempts, until the pool returns a record for *proxy*, and routes
    both http and https traffic through it.

    Args:
        proxy: Query passed to ``proxy_pool.get_single_proxy``.
        new: Force creation of a fresh session when truthy.
        agent: Set a User-Agent header when truthy.

    Returns:
        The module-level ``http_session``.
    """
    global http_session

    # NOTE(review): nesting is reconstructed from whitespace-collapsed source;
    # confirm whether the User-Agent and proxy should be refreshed on every
    # call or only when a new session is created.
    if new or http_session is None:
        http_session = requests.Session()
        # http_session.mount('http', HTTPAdapter(max_retries=5))

    if agent:
        user_agent = get_a_user_agent()
        http_session.headers["User-Agent"] = user_agent

    proxy_ip = None
    while proxy_ip is None:
        proxy_ip = proxy_pool.get_single_proxy(proxy)
        if proxy_ip is None:
            time.sleep(60)
    logger.info(proxy_ip)

    # Both schemes go through the same proxy, addressed with an http:// URL.
    http_session.proxies = {
        "http": "http://%s:%s" % (proxy_ip["ip"], proxy_ip["port"]),
        "https": "http://%s:%s" % (proxy_ip["ip"], proxy_ip["port"]),
    }
    return http_session
def request(url, callback):
    """Fetch *url* via ``http_client`` through a SOCKS proxy from the pool.

    Waits (sleeping 60 seconds between attempts) until the pool returns a
    record for the query ``{"http_type": "Socks4"}``. It then builds a
    ``tornado.httpclient.HTTPRequest`` with fixed iTunes-oriented headers and
    10-second request/connect timeouts, logs the proxy address, and hands the
    request to ``http_client.fetch``.

    Args:
        url: Address to request.
        callback: Passed through to ``http_client.fetch``.
    """
    proxy = {"http_type": "Socks4"}
    while True:
        proxy_ip = proxy_pool.get_single_proxy(proxy)
        if proxy_ip is not None:
            break
        time.sleep(60)

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
        "Host": "itunes.apple.com",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "",
    }

    host = proxy_ip["ip"]
    port = int(proxy_ip["port"])

    # The curl-prepare callback depends on the SOCKS version named in the
    # query; only the SOCKS4 request disables certificate validation.
    if proxy["http_type"].lower() != "socks4":
        http_request = tornado.httpclient.HTTPRequest(
            url,
            prepare_curl_callback=prepare_curl_socks5,
            proxy_host=host,
            proxy_port=port,
            headers=headers,
            request_timeout=10,
            connect_timeout=10,
        )
    else:
        http_request = tornado.httpclient.HTTPRequest(
            url,
            prepare_curl_callback=prepare_curl_socks4,
            proxy_host=host,
            proxy_port=port,
            headers=headers,
            validate_cert=False,
            request_timeout=10,
            connect_timeout=10,
        )

    logger.info("Proxy: %s:%s", proxy_ip["ip"], proxy_ip["port"])
    http_client.fetch(http_request, callback)