def get_product_list(keys, sorttype, num=0):
    """Fetch one page of Taobao search results, store it, and recurse to the next page.

    Args:
        keys: search keyword(s) substituted into the ``q`` query parameter.
        sorttype: sort mode substituted into the ``sort`` query parameter.
        num: result offset (``s`` query parameter); 0 fetches the first page.

    Side effects: persists items via ``insert_product_list``, updates the
    module-level paging state ``currentPage``/``pageSize``, and rotates the
    module-level ``proxies``/``headers`` whenever a request is blocked or fails.
    """
    global proxies, headers, currentPage, pageSize
    # Build the search URL; the callback name is randomized to look like a
    # genuine JSONP request.
    rooturl = 'https://s.taobao.com/search?ajax=true&callback=%s&tab=all&style=list&q=%s&sort=%s&s=%d' % (
        get_random(), keys, sorttype, num)
    logging.info(rooturl)
    print(rooturl)
    try:
        # timeout added so a stalled proxy cannot hang the crawler forever;
        # the resulting exception is handled below like any other failure.
        res = requests.get(rooturl, proxies=proxies, headers=headers, timeout=10).text
        # Decide whether the response is the expected JSONP payload; anything
        # else means we were blocked and must rotate the proxy and retry.
        # NOTE(review): the fixed slice offsets (2:7 and 12:) assume the
        # random callback name has a constant-length prefix -- confirm
        # against get_random().
        jsonp_str = res[2:7]
        if jsonp_str == 'jsonp':
            # Strip the JSONP wrapper and parse the JSON body.
            json_str = res[12:][:-2]
            data = json.loads(json_str)
            totalPage = data['mods']['pager']['data']['totalPage']      # total pages
            currentPage = data['mods']['pager']['data']['currentPage']  # page just fetched
            pageSize = data['mods']['pager']['data']['pageSize']        # items per page
            # Parse and persist the items of the current page.
            products = data['mods']['itemlist']['data']['auctions']
            insert_product_list(products, currentPage, keys, sorttype)
            print(currentPage, ':', totalPage)
            # Stop on the last page, otherwise recurse with the next offset
            # (pages fetched so far * page size).
            if currentPage == totalPage:
                return
            else:
                get_product_list(keys, sorttype, currentPage * pageSize)
        else:
            # Blocked (non-JSONP response): rotate proxy and headers, then
            # let the callback re-schedule this page.
            proxies['http'] = proxy.get_random_proxy()
            headers = headersobj.get_random_header()
            callback('1', currentPage, pageSize, keys, sorttype)
    except Exception as e:
        # Any request or parsing failure -- including timeouts: TimeoutError
        # is a subclass of Exception, so the original separate
        # ``except TimeoutError`` placed AFTER this clause was unreachable
        # dead code (and also called ``callback`` with a missing status-code
        # argument).  Rotate proxy/headers and retry via the callback.
        proxies['http'] = proxy.get_random_proxy()
        headers = headersobj.get_random_header()
        logging.debug(e)
        print(e)
        callback('2', currentPage, pageSize, keys, sorttype)
# 保存数据 conn = mysql.connector.connect(user='******', password='******', database='taobao') cursor = conn.cursor() ''' 打开浏览器 ''' browser = webdriver.Chrome() wait = WebDriverWait(browser, 5) browser.maximize_window() # 更新代理 proxies = {"http": ''} headers = headersobj.get_random_header() # 1、根据关键字和排序信息查询商品列表 def get_product_list(keys, sorttype, num=0): global proxies, headers # 获取商品列表信息的url rooturl = 'https://s.taobao.com/search?ajax=true&callback=%s&tab=all&style=list&q=%s&sort=%s&s=%d' % ( get_random(), keys, sorttype, num) logging.info(rooturl) print(rooturl) currentPage = 1 pageSize = 44 try: browser.get(rooturl)