def _deal_datas(self, datas): if self._to_md5: if isinstance(datas, list): keys = [get_md5(data) for data in datas] else: keys = get_md5(datas) else: keys = copy.deepcopy(datas) return keys
def get_proxy_from_http(proxy_source_url, **kwargs):
    """
    Fetch proxies from an HTTP endpoint, with a local file cache.

    The response body is cached under ``proxy_path`` in a file named
    after the md5 of the url, and refreshed when the file is older than
    ``local_proxy_file_cache_timeout`` seconds (default 60). Passing a
    falsy timeout forces a refresh on every call.

    :param proxy_source_url: url that returns the proxy list as text
    :param kwargs: local_proxy_file_cache_timeout -- cache TTL in seconds
    :return: proxies parsed by get_proxy_from_file()
    """
    filename = tools.get_md5(proxy_source_url) + ".txt"
    abs_filename = os.path.join(proxy_path, filename)
    update_interval = kwargs.get("local_proxy_file_cache_timeout", 60)

    need_update = (
        not update_interval  # falsy TTL -> force refresh
        or not os.path.exists(abs_filename)  # cache file missing
        or time.time() - os.stat(abs_filename).st_mtime > update_interval  # stale
    )

    if need_update:
        response = requests.get(proxy_source_url, timeout=20)
        # Fail fast on HTTP errors so an error page is never written into
        # the cache file (which would poison the proxy list for a full TTL).
        response.raise_for_status()
        with open(abs_filename, "w") as f:
            f.write(response.text)

    return get_proxy_from_file(filename)
def __init__(self, user_agent=None, proxies=None, cookies=None, **kwargs): self.__dict__.update(kwargs) self.user_agent = user_agent self.proxies = proxies self.cookies = cookies self.user_id = kwargs.get("user_id") or get_md5( user_agent, proxies, cookies)
def fingerprint(self):
    """Md5 fingerprint over the item's truthy field values.

    When ``unique_key`` is set, only those keys participate; otherwise
    every truthy value does. Values are stringified and sorted so the
    digest is independent of dict ordering. Returns None when no field
    qualifies.
    """
    unique = self.unique_key
    parts = [
        str(value)
        for key, value in self.to_dict.items()
        if value and (not unique or key in unique)
    ]
    if not parts:
        return None
    return tools.get_md5(*sorted(parts))
def fingerprint(self): """ request唯一表识 @return: """ args = [self.__dict__.get("url", "")] params = self.requests_kwargs.get("params") datas = self.requests_kwargs.get("data") if params: args.append(str(params)) if datas: args.append(str(datas)) return tools.get_md5(*args)
def fingerprint(self): """ request唯一表识 @return: """ url = self.__dict__.get("url", "") # url 归一化 url = tools.canonicalize_url(url) args = [url] for arg in ["params", "data", "files", "auth", "cert", "json"]: if self.requests_kwargs.get(arg): args.append(self.requests_kwargs.get(arg)) return tools.get_md5(*args)
def fingerprint(self): """ request唯一表识 @return: """ url = self.__dict__.get("url", "") # url 归一化 url = tools.canonicalize_url(url) args = [url] params = self.requests_kwargs.get("params") datas = self.requests_kwargs.get("data") if params: args.append(str(params)) if datas: args.append(str(datas)) return tools.get_md5(*args)
def run():
    """Poll redis for blocked ips and solve their slider captchas.

    Endless loop: pops per-ip captcha tasks from redis, opens each
    task's url in a proxied WebDriver, locates the Tencent captcha
    iframe, recognizes the slider gap position and drags the slider
    until the browser leaves the verify page (up to 20 attempts).
    Exceptions are logged and the loop continues.
    """
    while True:
        redisdb = RedisDB()
        try:
            block_ip = redisdb.sget(setting.CAPTCHA_BLOCK_IP_REDIS_KEY)
            if not block_ip:
                log.debug("暂无被封ip")
            for ip in block_ip:
                # pop this ip's task; is_pop=True presumably removes it from the hash
                task = redisdb.hget(setting.CAPTCHA_REDIS_KEY, ip, is_pop=True)
                # SECURITY: eval() on data read from redis executes arbitrary
                # code if the store is ever writable by an attacker — prefer
                # json.loads / ast.literal_eval. Left as-is in this review.
                task = eval(task)
                ua = task.get("ua")
                url = task.get("url")
                with WebDriver(proxy=ip, user_agent=ua) as browser:
                    log.info("解封ip {}, url {}".format(ip, url))
                    browser.get(url)
                    browser.implicitly_wait(5)
                    # the captcha lives inside an iframe; switch into it
                    frame = browser.find_element_by_id("tcaptcha_iframe")
                    browser.switch_to.frame(frame)
                    # outer loop: up to 20 slide attempts
                    # NOTE(review): the inner loop reuses the name `i`,
                    # shadowing the outer counter — harmless here since
                    # neither value is read, but worth renaming.
                    for i in range(20):
                        # busy-poll until both captcha images have a src
                        for i in range(1000):
                            bg_url = browser.find_element_by_id(
                                "slideBg").get_attribute("src")
                            slide_url = browser.find_element_by_id(
                                "slideBlock").get_attribute("src")
                            if bg_url and slide_url:
                                break
                        else:
                            # for-else: 1000 polls without both urls -> give up entirely
                            log.error("滑块加载失败")
                            return
                        bg_image = os.path.join(
                            CAPTCHA_PATH, "bg_" + tools.get_md5(bg_url) + ".png")
                        slide_image = os.path.join(
                            CAPTCHA_PATH, "slider_" + tools.get_md5(slide_url) + ".png")
                        if tools.download_file(
                                bg_url, bg_image) and tools.download_file(
                                slide_url, slide_image):
                            # recognize the gap position in the background image
                            x, y = get_gap_center_point(bg_image, slide_image,
                                                        show=False)
                            # scale: image is 680px wide but rendered at 340px,
                            # then subtract slider block offsets (27.5 + 30)
                            # — TODO confirm these magic offsets against the page
                            x = x * 340 / 680
                            x = x - 27.5 - 30
                            # drag the slider along a human-like track
                            slide_btn = browser.find_element_by_id(
                                "tcaptcha_drag_thumb")
                            tracks = track.get_tracks(x)
                            drag_and_drop(browser, slide_btn, tracks)
                            # clean up the downloaded images
                            os.remove(bg_image)
                            os.remove(slide_image)
                            tools.delay_time(2)
                            # leaving verify.maoyan.com means the captcha passed
                            if "verify.maoyan.com" not in browser.current_url:
                                log.info("解封成功")
                                break
                            else:
                                # failed attempt: click the refresh icon if present
                                try:
                                    browser.find_element_by_css_selector(
                                        ".tc-action-icon").click()
                                except:
                                    pass
                        tools.delay_time(3)
        except Exception as e:
            log.error(e)