def fetch(tasks):
    task, results, s = tasks
    # rotate the User-Agent for every task
    s.headers.update({'User-Agent': random.choice(USER_AGENTS)})
    print(task)
    url = "https://www.amazon.com/dp/" + task
    print(url)
    s.proxies.update({"http": get_proxy()})
    while True:
        res = s.get(url)
        if res.status_code == 404:
            # product page no longer exists; nothing to scrape
            return
        elif res.status_code != 200:
            # current proxy looks banned: drop it from the pool
            # and fall back to the shared API session
            print(res.status_code)
            s.get("http://123.207.17.216:5000/delete")
            s = api_s
            print('IP banned, changing IP')
            continue
        else:
            break
    html = res.content.decode('utf-8', 'ignore')
    result = deal_bussines(html)
    if result != {}:
        db(result, task)
    else:
        print("result = ", results)
    return
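# The helpers fetch() relies on -- get_proxy(), deal_bussines(), db() --
# are defined elsewhere. A minimal sketch of get_proxy(), assuming it
# queries the same local proxy-pool service that the /delete call above
# targets; the /get endpoint and the "http://" + text convention (which
# would explain the "http://0" empty-pool sentinel checked in the batch
# loop below) are assumptions, not confirmed by the source.
import requests

def get_proxy():
    resp = requests.get("http://123.207.17.216:5000/get")
    # "0" from the pool service would yield the "http://0" sentinel
    return "http://" + resp.text.strip()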
def download_favicon(self):
    # make cache directory if needed
    try:
        dir, name = os.path.split(self.favicon_path)
        os.makedirs(dir)
    except Exception:
        pass
    # try to download the favicon through the configured proxy
    try:
        opener = urllib.request.build_opener(util.get_proxy())
        f = opener.open(self.favicon_url)
        data = f.read()
        f.close()
        f = open(self.favicon_path, 'wb')
        f.write(data)
        f.close()
    except Exception:
        pass
# Python 2 variant of the same method, using urllib2 in place of
# urllib.request; otherwise identical.
def download_favicon(self):
    # make cache directory if needed
    try:
        dir, name = os.path.split(self.favicon_path)
        os.makedirs(dir)
    except Exception:
        pass
    # try to download the favicon through the configured proxy
    try:
        opener = urllib2.build_opener(util.get_proxy())
        f = opener.open(self.favicon_url)
        data = f.read()
        f.close()
        f = open(self.favicon_path, 'wb')
        f.write(data)
        f.close()
    except Exception:
        pass
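# A minimal sketch of util.get_proxy() for the favicon downloaders
# above: build_opener() expects handler instances, so the helper
# presumably returns a ProxyHandler. With no argument, ProxyHandler
# picks up proxies from the environment (http_proxy / https_proxy);
# the actual util module is not shown, so this is an assumption.
import urllib.request

def get_proxy():
    return urllib.request.ProxyHandler()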
def _search(self, keyword, p, time_out, ct=True):
    time.sleep(time_out)
    self._query.update({'query': keyword, 'page': str(p)})
    if ct:
        # append a millisecond timestamp to the query string
        self._query.update({'t': str(int(time.time() * 1000))})
    # retry until the request goes through
    while True:
        try:
            resp = requests.get(self._url, headers=self._headers,
                                params=self._query, proxies=get_proxy(),
                                timeout=300)
            break
        except Exception:
            continue
    if self._check_anti_spider(resp):
        print(Error_message.terminate_mess)
        exit()
    content = parse_wx_url_from(resp)
    print(f'Page {p} search results -- links found: {len(content)}')
    if len(content) == 0:
        # an empty page: toggle the timestamp flag for the next request
        self.ct_status = not self.ct_status
    self._push(content)
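# How _check_anti_spider() decides a response was blocked is not shown.
# A plausible sketch, written here as a plain helper that would live on
# the class: look for block markers in the final URL or body. Both
# markers ('antispider', 'captcha') are assumptions based on how
# Sogou-style WeChat scrapers are typically blocked, not confirmed by
# the source.
def _check_anti_spider(resp):
    return 'antispider' in resp.url or 'captcha' in resp.text.lower()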
s.headers.update({"Accept-Encoding": "gzip"})
s.headers.update({"Host": "www.amazon.com"})
s.headers.update({
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
})
s.headers.update({"Origin": "https://www.amazon.com"})
s.headers.update({"X-Requested-With": "XMLHttpRequest"})
s.headers.update(
    {"Content-Type": "application/x-www-form-urlencoded;charset=UTF-8"})
s.headers.update({"Connection": "keep-alive"})

cursor.execute("select distinct asin from amz_review_task")
tasks = cursor.fetchall()
conn.commit()

results = []
pool = ThreadPoolExecutor(max_workers=5)
# process the task list in batches of five, one proxy check per batch
tasks = [(x[0], results, s) for x in tasks]
start = 0
end = 5
for num in range(len(tasks) // 5 + 1):
    proxy = get_proxy()
    if proxy == "http://0":
        # the pool returned no proxy; rotate the IP and ask again
        print("changing ip")
        change_ip()
        proxy = get_proxy()
    print(proxy)
    # map() schedules the batch immediately; results are not consumed
    pool.map(fetch, tasks[start:end])
    start += 5
    end += 5
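# change_ip() is not defined in this section. Given the /delete call in
# fetch() above, it plausibly evicts the dead proxy from the pool and
# waits until a fresh one is available; the /delete and /get endpoints
# mirror the calls used above but are otherwise assumptions.
import time

def change_ip():
    requests.get("http://123.207.17.216:5000/delete")
    while True:
        candidate = requests.get("http://123.207.17.216:5000/get").text.strip()
        if candidate and candidate != "0":
            # a usable proxy is back in the pool
            return
        time.sleep(1)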