def __init__(self, type=1, status=1):
    self.status = status
    self.type = type
    if self.type == 1:
        chrome_options = webdriver.ChromeOptions()
        if spider_config.getint('proxyconf', 'switch') >= 1:
            proxy_type = spider_config.getint('proxyconf', 'type')
            if proxy_type == 1:
                chrome_options.add_extension(
                    get_chrome_proxy_extension(spider_config.get('proxy1', 'dynamicProxy')))
            if proxy_type == 2:
                data = json.dumps({"targetSiteName": spider_config.get("proxy2", "targetSiteName")})
                request = urllib2.Request(url=spider_config.get("proxy2", "dynamicProxy"), data=data)
                response = urllib2.urlopen(request)
                proxy_data = json.loads(response.read())
                logging.info("Proxy returned: " + json.dumps(proxy_data))
                proxy_ip = proxy_data["proxy"]["IP"]
                proxy_port = proxy_data["proxy"]["port"]
                chrome_options.add_argument('--proxy-server=http://' + proxy_ip + ":" + proxy_port)
        # Disable image loading to speed up page rendering.
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        self.browser = webdriver.Chrome(spider_config.get('browser', 'chromedriver'),
                                        chrome_options=chrome_options)
    elif self.type == 2:
        self.browser = webdriver.Firefox()
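
A hedged aside: the `webdriver.Chrome(path, chrome_options=...)` call above matches the Selenium 3 API; Selenium 4 (Python 3 only) removed both the positional driver path and the `chrome_options=` keyword. A minimal sketch of the equivalent construction under Selenium 4, assuming the same spider_config keys:

    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service

    chrome_options = webdriver.ChromeOptions()
    # Same image-blocking preference as above, to speed up page loads.
    chrome_options.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2})
    browser = webdriver.Chrome(
        service=Service(spider_config.get('browser', 'chromedriver')),
        options=chrome_options)
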
def get_shopping_result(self, browser, req):
    browser.get(self.get_shopping_url(req))
    # getElementById returns null (not undefined) when the node is missing, so
    # return the node directly; Python sees None until the table has loaded.
    if_loaded = 'return document.getElementById("table-0");'
    result = browser.execute_script(if_loaded)
    t = 0
    timeout = spider_config.getint('spider', 'timeout')
    if timeout < 10:
        timeout = 10
    # Poll once per second until the results table appears or we time out.
    while not result and t <= timeout:
        t = t + 1
        self.logger.info('sleep {0}'.format(t))
        result = browser.execute_script(if_loaded)
        time.sleep(1)
    if result:
        result = browser.execute_script(js_tiger_shopping)
        result["status"] = 0
    else:
        result = {"status": 1}
    self.logger.debug(result)
    return result
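
The hand-rolled polling loop above can also be expressed with Selenium's built-in `WebDriverWait`; a minimal sketch, assuming that waiting for `#table-0` to appear is the only readiness condition (the helper name `wait_for_results_table` is hypothetical):

    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    def wait_for_results_table(browser, timeout):
        # Block until #table-0 is attached to the DOM; return None on timeout,
        # mirroring the {"status": 1} failure branch above.
        try:
            return WebDriverWait(browser, timeout).until(
                EC.presence_of_element_located((By.ID, "table-0")))
        except TimeoutException:
            return None
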
def get_shopping_result(self, browser, req):
    browser.get(self.get_shopping_url(req))
    # Return the marker node (or null) directly: getElementById yields null when
    # the node is missing, and getElementsByClassName yields a collection that
    # is never undefined but may be empty, so test its length instead of typeof.
    if req.entry == 'mobile':
        if_loaded = 'return document.getElementById("fareflight");'
    else:
        if_loaded = ('var els = document.getElementsByClassName("js_availability_container"); '
                     'return els.length > 0 ? els[0] : null;')
    result = browser.execute_script(if_loaded)
    t = 0
    timeout = spider_config.getint('spider', 'timeout')
    if timeout < 10:
        timeout = 10
    while not result and t <= timeout:
        t = t + 1
        self.logger.info('sleep {0}'.format(t))
        result = browser.execute_script(if_loaded)
        time.sleep(1)
    if result:
        if req.parser == 'python':
            result = parse_pc_shopping(browser, req)
        elif req.entry == 'mobile':
            result = browser.execute_script(js_mobile_shopping)
        else:
            result = browser.execute_script(js_pc_shopping)
        result["status"] = 0
    else:
        result = {"status": 1}
    self.logger.debug(result)
    return result
def get_shopping_result(proxies, req):
    url = "https://booking.tigerair.com.au/TigerAirIBE/Booking/Search"
    # First request: fetch the session cookie and anti-forgery token needed
    # by the second (POST) request.
    r = requests.get(url, proxies=proxies)
    doc = pq(r.text)
    temp_token = doc("[name='__RequestVerificationToken']").val()
    if req.flightOption == 1:
        trip_kind = "oneWay"
    else:
        trip_kind = "roundTrip"
    data = {'__RequestVerificationToken': temp_token,
            'TripKind': trip_kind,
            'Destination': req.toCity,
            'Origin': req.fromCity,
            'DepartureDate': TigerHttp.date_format(req.startDate),
            'AdultCount': str(req.adultNumber),
            'ChildCount': str(req.childNumber)}
    try:
        r = requests.post(url=url, data=data, proxies=proxies,
                          timeout=spider_config.getint("spider", "timeout"),
                          cookies=r.cookies)
        doc = pq(r.text)
        return TigerHttp.start(doc, req)
    except requests.exceptions.Timeout:
        # ConnectTimeout subclasses Timeout, so one handler covers both.
        return {"status": 1}
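
For reference, a usage sketch of the HTTP variant above; `_FakeReq` is a hypothetical stand-in carrying only the fields the function reads, and the proxy address is an example value:

    class _FakeReq(object):
        flightOption = 1            # 1 = one-way, anything else = round trip
        fromCity, toCity = 'SYD', 'MEL'
        startDate = '2018-06-01'    # whatever format TigerHttp.date_format expects
        adultNumber, childNumber = 1, 0

    proxies = {"http": "http://127.0.0.1:8888",
               "https": "http://127.0.0.1:8888"}
    print(get_shopping_result(proxies, _FakeReq()))
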
class SchedulerConfig(object):
    JOBS = [{
        'id': 'proxy',
        'func': '__main__:switch_proxy_ip',
        'args': None,
        'trigger': 'interval',
        'seconds': spider_config.getint('proxy1', 'switchTime')
    }]
def get_proxies(self):
    proxies = None
    if spider_config.getint("proxyconf", "switch") > 0:
        if len(self._ProxiesPool) > 0:
            # Rotate the pool: pop an arbitrary entry, then put it back.
            cache_proxy = self._ProxiesPool.pop()
            self._ProxiesPool.add(cache_proxy)
            cache_proxy = cache_proxy.split("://")[1]
            return {
                "http": "http://" + cache_proxy,
                "https": "http://" + cache_proxy
            }
        try:
            if spider_config.getint("proxyconf", "type") == 1:
                proxy_inf = spider_config.get("proxy1", "dynamicProxy")
                proxies = {
                    "http": "http://" + proxy_inf,
                    "https": "http://" + proxy_inf
                }
            if spider_config.getint("proxyconf", "type") == 2:
                data = json.dumps({
                    "targetSiteName": spider_config.get("proxy2", "targetSiteName")
                })
                request = urllib2.Request(url=spider_config.get("proxy2", "dynamicProxy"), data=data)
                response = urllib2.urlopen(request)
                proxy_data = json.loads(response.read())
                logging.info("Dynamic proxy: " + json.dumps(proxy_data))
                proxy_ip = proxy_data["proxy"]["IP"]
                proxy_port = proxy_data["proxy"]["port"]
                proxies = {
                    "http": "http://" + proxy_ip + ":" + proxy_port,
                    "https": "http://" + proxy_ip + ":" + proxy_port
                }
        except Exception:
            self._logger.error("Failed to obtain a dynamic proxy")
    return proxies
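
The dict returned by `get_proxies` is already shaped for the `proxies=` argument of requests; a short usage sketch (the `spider` instance name and target URL are assumptions):

    proxies = spider.get_proxies()   # None when the proxy switch is off
    r = requests.get("https://httpbin.org/ip", proxies=proxies, timeout=10)
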
def acquire_browser(self, type):
    self._lock.acquire()
    result = None
    # Reuse an idle driver of the requested type if one exists.
    for bs in self._driverPool:
        if bs.status == 0 and bs.type == type:
            bs.status = 1
            result = bs.browser
            break
    # Otherwise grow the pool, up to the configured thread ceiling;
    # returns None when the pool is full and every driver is busy.
    max_thread = spider_config.getint('spider', 'maxThread')
    if result is None and len(self._driverPool) < max_thread:
        result = WebDriverInfo(type=type)
        self._driverPool.append(result)
        result = result.browser
    self._lock.release()
    return result
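
`acquire_browser` marks a pool entry busy, but nothing shown here marks it idle again; a hypothetical `release_browser` counterpart on the same pool class might look like this:

    def release_browser(self, browser):
        # Hypothetical counterpart to acquire_browser: flip the matching
        # pool entry back to idle so a later call can reuse it.
        self._lock.acquire()
        try:
            for bs in self._driverPool:
                if bs.browser is browser:
                    bs.status = 0
                    break
        finally:
            self._lock.release()
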
            str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(startTime))) +
            '-----end time: ' +
            str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(endTime))) +
            '--------------------took ' + str((endTime - startTime) * 1000) + 'ms')
        return result


def switch_proxy_ip():
    try:
        if spider_config.getint("proxyconf", "type") == 1 and spider_config.getint(
                "proxyconf", "switch") > 0:
            req = urllib2.Request(spider_config.get('proxy1', 'switchProxy'))
            res_data = urllib2.urlopen(req)
            res = res_data.read()
            logger.info('Switched proxy IP: ' + res)
    except Exception:
        logger.error('Failed to switch the proxy IP')


if __name__ == '__main__':
    # scheduler = APScheduler()
    # app.config.from_object(SchedulerConfig())
    # scheduler.init_app(app)
    # scheduler.start()
    port = spider_config.getint('server', 'port')
    logger.info('get port: {0}'.format(port))
    app.run(host="0.0.0.0", port=port, debug=False)