Example #1
def fetch(tasks):
    task, results, s = tasks
    # rotate the User-Agent on every request
    s.headers.update({'User-Agent': random.choice(USER_AGENTS)})
    print(task)
    url = "https://www.amazon.com/dp/" + task
    print(url)
    # route the session through a proxy taken from the pool
    s.proxies.update({"http": get_proxy()})
    while True:
        res = s.get(url)
        if res.status_code == 404:
            # the product page does not exist
            return
        elif res.status_code != 200:
            # likely banned: drop the current proxy from the pool and retry
            # with a fresh session
            print(res.status_code)
            s.get("http://123.207.17.216:5000/delete")
            s = api_s
            print('IP ban, changing IP')
            continue
        else:
            break
    html = res.content.decode('utf-8', 'ignore')
    result = deal_bussines(html)
    if result != {}:
        db(result, task)
    else:
        print("result = ", results)
        return
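Every example on this page calls a get_proxy() helper that is defined elsewhere in its project. In Example #1 it returns a single proxy URL string that the requests session stores in s.proxies, and a banned address is discarded by hitting the /delete endpoint of a proxy-pool service; Example #5 shows the same helper returning "http://0" when the pool is empty. A minimal sketch of such a helper, assuming a hypothetical /get endpoint on the same pool service (the path and response format are assumptions, not taken from the original project):

import requests

POOL_API = "http://123.207.17.216:5000"  # proxy-pool service seen in Example #1

def get_proxy():
    # assumption: the pool answers /get with a bare "host:port" string
    addr = requests.get(POOL_API + "/get", timeout=10).text.strip()
    return "http://" + addr  # yields "http://0" when the pool returns "0" (empty)

def delete_proxy():
    # drop the currently banned proxy from the pool, as Example #1 does inline
    requests.get(POOL_API + "/delete", timeout=10)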
Example #2
def download_favicon(self):
    # make cache directory if needed
    try:
        dir, name = os.path.split(self.favicon_path)
        os.makedirs(dir)
    except Exception:
        pass
    # try to download the favicon
    try:
        opener = urllib.request.build_opener(util.get_proxy())
        f = opener.open(self.favicon_url)
        data = f.read()
        f.close()
        f = open(self.favicon_path, 'wb')
        f.write(data)
        f.close()
    except Exception:
        pass
Example #3
def download_favicon(self):
    # make cache directory if needed
    try:
        dir, name = os.path.split(self.favicon_path)
        os.makedirs(dir)
    except Exception:
        pass
    # try to download the favicon
    try:
        opener = urllib2.build_opener(util.get_proxy())
        f = opener.open(self.favicon_url)
        data = f.read()
        f.close()
        f = open(self.favicon_path, 'wb')
        f.write(data)
        f.close()
    except Exception:
        pass
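Examples #2 and #3 pass util.get_proxy() straight to build_opener(), so there the helper is expected to return a urllib handler rather than a URL string. A rough sketch under that assumption (the real util.get_proxy() probably reads the application's own proxy settings):

import urllib.request

def get_proxy():
    # assumption: wrap whatever proxy settings apply in a ProxyHandler,
    # which build_opener() accepts directly
    proxies = urllib.request.getproxies()  # e.g. {'http': 'http://10.0.0.1:8080'}
    return urllib.request.ProxyHandler(proxies)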
Example #4
def _search(self, keyword, p, time_out, ct=True):
    time.sleep(time_out)
    self._query.update({'query': keyword, 'page': str(p)})
    if ct:
        self._query.update({'t': str(int(time.time() * 1000))})
    # retry until the request goes through the proxy without a network error
    while True:
        try:
            resp = requests.get(self._url,
                                headers=self._headers,
                                params=self._query,
                                proxies=get_proxy(),
                                timeout=300)
            break
        except Exception:
            continue
    if self._check_anti_spider(resp):
        print(Error_message.terminate_mess)
        exit()
    content = parse_wx_url_from(resp)
    print(f'Page {p} search results -- links collected: {len(content)}')
    if len(content) == 0:
        self.ct_status = not self.ct_status
    self._push(content)
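Example #4 hands get_proxy() directly to the proxies= argument of requests.get, so in that project the helper must return a requests-style mapping of scheme to proxy URL rather than a single string. A sketch under that assumption (the addresses are placeholders, not from the original code):

import random

PROXY_POOL = [
    "http://10.0.0.1:8080",  # placeholder addresses
    "http://10.0.0.2:8080",
]

def get_proxy():
    # pick one address and map it to both schemes, the shape requests expects
    addr = random.choice(PROXY_POOL)
    return {"http": addr, "https": addr}

Note also that the bare while True / except retry loop in Example #4 has no back-off or attempt limit; in practice a short sleep and a maximum retry count keep a dead proxy from spinning forever.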
Example #5
    s.headers.update({"Accept-Encoding": "gzip"})
    s.headers.update({"Host": "www.amazon.com"})
    s.headers.update({
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
    })
    s.headers.update({"Origin": "https://www.amazon.com"})
    s.headers.update({"X-Requested-With": "XMLHttpRequest"})
    s.headers.update(
        {"Content-Type": "application/x-www-form-urlencoded;charset=UTF-8"})
    s.headers.update({"Connection": "keep-alive"})
    cursor.execute("select distinct asin from amz_review_task")
    tasks = cursor.fetchall()
    conn.commit()
    results = []
    pool = ThreadPoolExecutor(max_workers=5)
    # process the tasks in batches of 5, refreshing the proxy between batches
    tasks = [(x[0], results, s) for x in tasks]
    start = 0
    end = 5
    for num in range(len(tasks) // 5 + 1):
        proxy = get_proxy()
        if proxy == "http://0":
            print("changing ip ")
            change_ip()
            proxy = get_proxy()
            print(proxy)
        pool.map(fetch, tasks[start:end])
        start += 5
        end += 5
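Example #5 is meant to refresh the proxy after every batch of five ASINs, but ThreadPoolExecutor.map returns a lazy iterator that the loop never consumes, so the code moves on to the next batch without waiting for the previous one to finish, and any exception raised inside fetch() is silently dropped. A sketch of the same batching loop that actually blocks per batch, assuming the fetch, tasks, get_proxy and change_ip definitions from the examples above:

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=5) as pool:
    for start in range(0, len(tasks), 5):
        proxy = get_proxy()
        if proxy == "http://0":   # pool is empty: request a new IP first
            print("changing ip")
            change_ip()
            proxy = get_proxy()
        # list(...) drains the iterator: it waits for the whole batch and
        # re-raises any exception thrown inside fetch()
        list(pool.map(fetch, tasks[start:start + 5]))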