def get_from_unuseless_phonenum():
    """Recycle every cookie parked in the 'unuseless' hash back into the live pool.

    Reads all field/value pairs (cookie-json -> phone number) from the Redis
    hash ``unuseless_cookies_car168`` in db 2 and re-registers each pair in
    the active ``cookies_car168`` hash, printing each entry as it goes.
    """
    redis_conn = set_redis(2)
    # for i in r.hkeys("unuseless_cookies_car168"):
    #     r.lpush("cookies_car168_list", i)
    stale_entries = redis_conn.hgetall("unuseless_cookies_car168")
    for cookie_field, phone_value in stale_entries.items():
        print(cookie_field, phone_value)
        redis_conn.hset("cookies_car168", cookie_field, phone_value)
def clicklogin(driver, phone_num, projectid, token, matchrule):
    """Log in to the site with an SMS verification code and persist the cookies.

    Fills the phone-number form, requests a verification code through the
    SMS-platform API (``get_code``), and on success stores the DEVICE_ID / U
    cookies in Redis db 2 — both in the ``cookies_car168_list`` rotation list
    and in the ``cookies_car168`` hash (cookie-json -> phone number). Phone
    numbers that never receive a code are parked in the
    ``unuseless_car168_phonenum`` set instead.

    Args:
        driver: selenium webdriver already positioned on the login page.
        phone_num: phone number to log in with.
        projectid: SMS-platform project id.
        token: SMS-platform auth token.
        matchrule: SMS-platform rule used to extract the code from the message.
    """
    # Phone-number page
    phone_input = driver.find_element_by_id("uname")
    phone_input.send_keys(phone_num)
    # Click "get verification code"
    sendcode = driver.find_element_by_id("sendCode")
    sendcode.click()
    # Receive the code; numbers that get no SMS are handled in the else branch
    code = get_code(projectid, phonenum=phone_num, token=token, matchrule=matchrule)
    if code:
        # Type the code and submit the login form
        code_input = driver.find_element_by_id("code")
        code_input.send_keys(code)
        login = driver.find_element_by_id("button")
        login.click()
        sleep(2)
        driver.refresh()
        tbCookies = driver.get_cookies()
        # Keep only the two cookies the authenticated session needs
        cookies = {}
        for item in tbCookies:
            if item['name'] in ("DEVICE_ID", "U"):
                cookies[item['name']] = item['value']
        print(cookies)
        r = set_redis(2)
        # Store the cookie json both in the rotation list and the lookup hash
        dump_cookies = json.dumps(cookies)
        r.lpush("cookies_car168_list", dump_cookies)
        r.hset("cookies_car168", dump_cookies, phone_num)
        print("成功存入redis")
        driver.quit()
    else:
        print("将无用注册手机号存入redis")
        set_redis(2).sadd("unuseless_car168_phonenum", phone_num)
        print("没有接收到验证码")
        # FIX: the original leaked the webdriver session on this path —
        # only the success branch called quit(). The caller builds a fresh
        # driver per attempt, so closing here is safe.
        driver.quit()
def get_phone():
    """Pop one registered phone number from Redis and re-acquire it on the SMS platform.

    Returns:
        A ``(token, platform_phone, raw_phone)`` tuple on success, where
        ``platform_phone`` comes from ``build_phonenum`` and ``raw_phone`` is
        the decoded value popped from the ``car168_phonenum`` Redis set.
        Returns None when the set yields nothing.
    """
    popped = set_redis(2).spop("car168_phonenum")
    print("从列表中取得手机号:", popped)
    if not popped:
        return None
    decoded_phone = popped.decode()
    # token = get_token('maxfire', "ma123456")
    # phone_num_from = get_phonenum(projectid, token=token, phone=phone_num, loop=2)
    token, platform_phone = build_phonenum(projectid, loop=2, phone=decoded_phone)
    print("解码平台手机号:", platform_phone)
    return token, platform_phone, decoded_phone
class MysqlPipeline(object):
    """Scrapy pipeline that persists car-style rows to MySQL and tracks detail pages in Redis.

    Flow per item: look the style up in ``car_style``; if found, fill in any
    columns the stored row is missing; otherwise insert a new row. Then, when
    the item carries a price or a volume, insert a ``car_detail`` row unless
    Redis shows the detail page was already saved with the same updatetime.
    """

    # Redis db 1 holds per-spider hashes of
    # "<detail_url> -> <updatetime>##<car_detail id>" so previously saved
    # detail pages are recognised across runs.
    save_url_r = set_redis(db=1)

    def open_spider(self, spider):
        """Open the MySQL connection when the spider starts."""
        self.conn = pymysql.connect(
            host=HOST,
            port=3306,
            db=MYDB,
            user=USER,
            password=PASSWD,
            charset=charset,
        )
        self.cur = self.conn.cursor()

    def close_spider(self, spider):
        """Release the cursor and connection when the spider closes."""
        self.cur.close()
        self.conn.close()

    def process_item(self, item, spider):
        """Insert/update the car style row, then store price details when present."""
        selflog = SelfLog(spider.name)
        selflog_error = SelfLog("error")
        keys = [
            "brand", "type", "year", "style", "guide_price", "displacement",
            "configuration", "version", "status"
        ]
        values = [item[i] for i in keys]
        # Match against existing rows first; update when the stored row lacks
        # data this item can supply, insert a new row otherwise.
        sql_search_style = ("select id, brand, type from car_style where brand=%s "
                            "and type=%s and year=%s and style=%s and guide_price=%s")
        search_db_result = Mysqlpython().readall(sql_search_style, [
            item["brand"], item['type'], item['year'], item['style'],
            item['guide_price']
        ])
        if search_db_result:
            commit_id = search_db_result[0][0]
            update_set = "update car_style set status=null "
            update_where = " where id=%s "
            setparams = []
            # FIX: the original tested `not item['guide_price']` (and
            # `not item['displacement']`), which would overwrite the stored
            # column with an empty value. The `version` branch shows the
            # intent: update only when the item HAS a usable value and the
            # stored column is empty.
            if item['guide_price'] and item['guide_price'] != "None":
                update_set += " , guide_price=%s"
                update_where += " and (guide_price is null or guide_price='None')"
                setparams.append(item['guide_price'])
            if item['displacement'] and item['displacement'] != "None":
                update_set += " , displacement=%s"
                update_where += " and (displacement is null or displacement='None')"
                setparams.append(item['displacement'])
            if item['version'] and item['version'] != "None":
                update_set += " , version=%s"
                update_where += " and (version is null or version='None')"
                setparams.append(item['version'])
            # Only execute when at least one SET clause was added.
            if update_set != "update car_style set status=null ":
                sql_update = update_set + update_where
                # FIX: in the assembled SQL the SET-clause placeholders come
                # before the `id=%s` placeholder, so the parameter list must
                # follow that order. The original put commit_id first,
                # mis-binding every placeholder.
                sqlparam = setparams + [commit_id]
                try:
                    self.cur.execute(sql_update, sqlparam)
                    self.conn.commit()
                except Exception as e:
                    selflog_error.logger.error(
                        "更新sql出错:{sql_update}, values:{values}, id:{commit_id},e:{e}"
                        .format(sql_update=sql_update,
                                values=sqlparam,
                                commit_id=commit_id,
                                e=e))
        else:
            # No matching style row: insert a fresh one.
            sql = "insert into `{}` ({}) values ({})".format(
                item.table_name,
                ','.join(keys),
                # Placeholders keep non-string values intact.
                ','.join(['%s'] * len(values)),
            )
            try:
                self.cur.execute(sql, values)
                self.conn.commit()
            except Exception as e:
                selflog_error.logger.info(
                    "{spidername} 插入车型表出错e:{e}sql:{sql}, --values:{values}".
                    format(spidername=spider.name, e=e, sql=sql, values=values))
                commit_id = None
            else:
                commit_id = self.cur.lastrowid
        # Without a price or a volume there is nothing to put in car_detail.
        if commit_id and (item['price'] or item['volume']):
            requesturl = item['detail_url']
            rediskey = item['rediskey']
            # Redis value format: "<updatetime>##<saved car_detail id>".
            hash_value = self.save_url_r.hget(rediskey, requesturl)
            if hash_value:
                hash_value = hash_value.decode()
                updatetime = hash_value.split('##')[0]
                # Insert only when the page's updatetime changed since the
                # last save; identical updatetime means nothing new.
                if updatetime != item['updatetime']:
                    self.insert_detaildata(spider, commit_id, item,
                                           selflog_error, rediskey)
            else:
                # Unknown detail url: save it and register it in Redis.
                self.insert_detaildata(spider, commit_id, item, selflog_error,
                                       rediskey)
        else:
            selflog.logger.info(
                "没有交易价格和交易量数据,car_detial表不进行插入key:%s, --values:%s" %
                (keys, values))
        return item

    def insert_detaildata(self, spider, commit_id, item, selflog_error, rediskey):
        """Insert one car_detail row and record its url -> updatetime##id in Redis."""
        table_name = "car_detail"
        second_key = [
            "platform", "vehicleType", "price", "volume", "updatetime",
            "detail_url"
        ]
        second_values = [
            item['platform'], commit_id, item['price'],
            str(item['volume']), item['updatetime'], item['detail_url']
        ]
        second_sql = "insert into `{}` ({}) values ({})".format(
            table_name, ','.join(second_key),
            ','.join(['%s'] * len(second_values)))
        try:
            self.cur.execute(second_sql, second_values)
            self.conn.commit()
        except Exception as e:
            selflog_error.logger.info(
                "{spidername} 插入车价详情error:{e}--style表中的id:{id}--出错的sql:{second_sql}, --values:{second_values}"
                .format(spidername=spider.name,
                        e=e,
                        id=commit_id,
                        second_sql=second_sql,
                        second_values=second_values))
        else:
            self.save_url_r.hset(
                rediskey, item['detail_url'],
                item['updatetime'] + "##" + str(self.cur.lastrowid))
import datetime import json import re import requests import scrapy from fake_useragent import UserAgent from scrapy import Request from cars.CONSTANT import China, USA, Canada, European, Mexico from cars.items import CarStyleItem from cars.log_utils import SelfLog from cars.utils import deal_style, deal_year, deal_displacement, deal_guideprice, sav_item, set_redis, Mysqlpython, \ deal_updatetime cookie_r = set_redis(2) set_url_r = set_redis(4) # type_r = set_redis() dbhelper = Mysqlpython() cookies_chezhen = "cookies_chezhen" unuseless_cookies_chezhen = "unuseless_cookies_chezhen" url_redis = "chezhen" url_redis_chezhen = "chezhen_urls" class ChezhenSpider(scrapy.Spider): name = 'chezhen' selflog = SelfLog(name) cookies = {
def signup_car168(driver):
    """Register a new account in the chehang168 Android app via Appium.

    Walks the full signup flow: dismiss the splash screen, obtain a phone
    number and token from the SMS-code platform (``build_phonenum``), request
    and enter the 4-digit SMS code (solving the slider CAPTCHA on the way),
    fill in the registration form (name, password, city, company), then log
    out via the settings screen. On a fully successful run the phone number
    is added to the Redis set ``car168_phonenum`` (db 2). The driver session
    is always quit at the end; if no SMS code arrives the function quits the
    driver and returns None early.

    Args:
        driver: an Appium webdriver attached to the chehang168 app.
    """
    try:
        # Wait ~2s for the optional "skip" splash button and dismiss it.
        if WebDriverWait(driver, 2).until(lambda x: x.find_element_by_xpath(
                "//android.widget.Button[@resource-id='com.zjw.chehang168:id/itemButton']"
        )):
            driver.find_element_by_xpath(
                "//android.widget.Button[@resource-id='com.zjw.chehang168:id/itemButton']"
            ).click()
    except Exception as e:
        print(e)
    # Token and phone number obtained from the SMS-code platform API.
    token, phone_nume = build_phonenum(projectid, loop=2, filter='')
    try:
        # Type the phone number into the signup form.
        if WebDriverWait(driver, 3).until(lambda x: x.find_element_by_xpath(
                "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/unameEdit']"
        )):
            driver.find_element_by_xpath(
                "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/unameEdit']"
            ).send_keys(phone_nume)
            # Tap the "get verification code" button.
            driver.find_element_by_xpath(
                "//android.widget.TextView[@resource-id='com.zjw.chehang168:id/loginButton']"
            ).click()
    except Exception as e:
        print(e, "输入手机号点击获取验证码出错")
    try:
        # Slider CAPTCHA: drag the knob across the "swipe right" track.
        if WebDriverWait(driver, 15).until(lambda x: x.find_element_by_xpath(
                "//android.view.View[@resource-id='nc_1_n1t']")):
            inter = driver.find_element_by_xpath(
                "//android.view.View[@resource-id='nc_1_n1t']")
            wrapper = driver.find_element_by_xpath(
                "//android.view.View[@text='请向右滑动验证']")
            print(inter.location, inter.size)
            # Start at the centre of the knob, end at the far edge of the track.
            start = [
                inter.location['x'] + inter.size["width"] // 2,
                inter.location['y'] + inter.size["height"] // 2
            ]
            end = [
                wrapper.location['x'] + wrapper.size["width"] -
                inter.size["width"] // 2,
                inter.location['y'] + inter.size["width"] // 2
            ]
            # end = [670, 0]
            print(start, end)
            touch_test(driver=driver, start=start, end=end, el=inter)
    except Exception as e:
        print(e, "滑动验证模块出错")
    # Poll the SMS platform for the verification code.
    print("手机号", phone_nume)
    code = get_code(projectid=projectid,
                    phonenum=phone_nume,
                    token=token,
                    matchrule=matchrule)
    # code = "1234"
    print("验证码:***{code}***".format(code=code))
    # relase_phonenum(projectid, phone_nume, token)
    if not code:
        # No SMS arrived: abandon this attempt entirely.
        print("没有收到验证码,直接退出")
        driver.quit()
        return None
    try:
        # Enter the 4-digit code, one digit per input box (tv_0 .. tv_3).
        if WebDriverWait(driver, 15).until(lambda x: x.find_element_by_xpath(
                "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/tv_0']"
        )):
            x1 = driver.find_element_by_xpath(
                "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/tv_0']"
            )
            x1.click()
            x1.send_keys(code[0])
            x2 = driver.find_element_by_xpath(
                "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/tv_1']"
            )
            x2.click()
            x2.send_keys(code[1])
            x3 = driver.find_element_by_xpath(
                "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/tv_2']"
            )
            x3.click()
            x3.send_keys(code[2])
            x4 = driver.find_element_by_xpath(
                "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/tv_3']"
            )
            x4.click()
            x4.send_keys(code[3])
    except Exception as e:
        print(e)
    # If a wrong-code error dialog appears, dismiss it; the exception path
    # here means no dialog was found, i.e. the code was accepted.
    try:
        if WebDriverWait(driver, 1).until(lambda x: x.find_element_by_xpath(
                "//android.widget.TextView[@resource-id='com.zjw.chehang168:id/btn2']"
        )):
            driver.find_element_by_xpath(
                "//android.widget.TextView[@resource-id='com.zjw.chehang168:id/btn2']"
            ).click()
            print("验证码错误")
    except Exception as e:
        print("验证通过:", e)
    # Fill in the personal-details page to complete registration.
    try:
        if WebDriverWait(driver, 5).until(lambda x: x.find_element_by_xpath(
                "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/nameEdit']"
        )):
            # Real name (picked at random from name_list).
            driver.find_element_by_xpath(
                "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/nameEdit']"
            ).send_keys(random.choice(name_list))
            # Login password.
            driver.find_element_by_xpath(
                "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/pwdEdit']"
            ).send_keys("ma123456")
            # Work location: tap to open the city chooser.
            driver.find_element_by_xpath(
                "//android.widget.TextView[@resource-id='com.zjw.chehang168:id/areaText']"
            ).click()
            choice_driver = driver
            choice_city(choice_driver)
            # Fill in the company name and finish the form.
            try:
                if WebDriverWait(
                        driver, 5
                ).until(lambda x: x.find_element_by_xpath(
                        "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/companyEdit']"
                )):
                    driver.find_element_by_xpath(
                        "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/companyEdit']"
                    ).send_keys(random.choice(carnames))
                    # Company type: pick the "其他" (other) radio button.
                    driver.find_element_by_xpath(
                        "//android.widget.RadioButton[@text='其他']").click()
                    # Tick the agree-to-policy checkbox.
                    driver.find_element_by_xpath(
                        "//android.widget.ImageView[@resource-id='com.zjw.chehang168:id/itemCheckImg']"
                    ).click()
                    # Submit the registration.
                    driver.find_element_by_xpath(
                        "//android.widget.TextView[@resource-id='com.zjw.chehang168:id/submitButton']"
                    ).click()
            except Exception as e:
                print("选择城市后没有进行页面跳转", e)
    except Exception as e:
        print("选择公司前出错,", e)
    # Tap the "Me" tab.
    try:
        if WebDriverWait(driver, 5).until(lambda x: x.find_element_by_xpath(
                "//android.widget.RadioButton[@resource-id='com.zjw.chehang168:id/radio_button4']"
        )):
            driver.find_element_by_xpath(
                "//android.widget.RadioButton[@resource-id='com.zjw.chehang168:id/radio_button4']"
            ).click()
    except Exception as e:
        print("没有我", e)
    # Tap the settings icon.
    try:
        if WebDriverWait(driver, 5).until(lambda x: x.find_element_by_xpath(
                "//android.widget.ImageView[@resource-id='com.zjw.chehang168:id/rightImg']"
        )):
            driver.find_element_by_xpath(
                "//android.widget.ImageView[@resource-id='com.zjw.chehang168:id/rightImg']"
            ).click()
    except Exception as e:
        print("没有设置", e)
    # Tap "log out".
    try:
        if WebDriverWait(driver, 5).until(lambda x: x.find_element_by_xpath(
                "//android.widget.TextView[@resource-id='com.zjw.chehang168:id/tv_content' and @text='退出登录']"
        )):
            driver.find_element_by_xpath(
                "//android.widget.TextView[@resource-id='com.zjw.chehang168:id/tv_content' and @text='退出登录']"
            ).click()
    except Exception as e:
        print("没有退出登录", e)
    # Tap the confirmation dialog's OK button.
    try:
        if WebDriverWait(driver, 5).until(lambda x: x.find_element_by_xpath(
                "//android.widget.Button[@resource-id='android:id/button1']")):
            driver.find_element_by_xpath(
                "//android.widget.Button[@resource-id='android:id/button1']"
            ).click()
    except Exception as e:
        print("没有确认按钮", e)
    # Whole flow succeeded: park the phone number in Redis for later logins.
    else:
        result = set_redis(2).sadd("car168_phonenum", phone_nume)
        print("phone_num{}写入redis中:{}".format(phone_nume, result))
    driver.quit()
# mapping = {json.dumps(cookies): int(phone_num)} # print(mapping) # r.zadd("cookies_car168", mapping) # 将cookie值存入redis list 和hash dump_cookies = json.dumps(cookies) r.lpush("cookies_car168_list", dump_cookies) r.hset("cookies_car168", dump_cookies, phone_num) print("成功存入redis") driver.quit() else: print("将无用注册手机号存入redis") set_redis(2).sadd("unuseless_car168_phonenum", phone_num) print("没有接收到验证码") if __name__ == '__main__': while set_redis(2).scard("car168_phonenum"): token, phone_num, phone_num_from_set = get_phone() if not phone_num: print("将无用注册手机号存入redis") set_redis(2).sadd("unuseless_car168_phonenum", phone_num_from_set) continue driver = get_driver() sleep(1) mouseclick() clicklogin(driver, phone_num, projectid, token=token, matchrule=matchrule)
import json import random import re import time import scrapy from fake_useragent import UserAgent from scrapy import Request from cars.CONSTANT import China, ChinaImport, USA, Canada, Mexico, European from cars.items import CarStyleItem from cars.log_utils import SelfLog from cars.utils import Mysqlpython, set_redis, deal_year, deal_style, deal_displacement, deal_updatetime, \ deal_guideprice cookie_r = set_redis(2) type_r = set_redis() dbhelper = Mysqlpython() set_url_r = set_redis(4) cookies_car168 = "cookies_car168" unuseless_cookies_car168 = "unuseless_cookies_car168" url_redis = "car168" url_redis_car168 = "car168_urls" class Car168Spider(scrapy.Spider): name = 'car168' allowed_domains = ['www.chehang168.com'] start_urls = ['http://www.chehang168.com/', 'http://www.chehang168.com/index.php?c=index&m=allBrands', "http://www.chehang168.com/index.php?c=index&m=Cardata"] selflog = SelfLog(name)
# Sample niuniuqiche listing url kept for manual testing against the site.
url = "http://www.niuniuqiche.com/v2/sell_cars?brand_name=%E5%A5%A5%E8%BF%AA&car_model_name=%E5%A5%A5%E8%BF%AAA3&firm_name=%E4%B8%80%E6%B1%BD-%E5%A4%A7%E4%BC%97%E5%A5%A5%E8%BF%AA"

# Redis db 2 holds the car168 cookie pool / phone-number sets.
r = set_redis(2)

if __name__ == '__main__':
    # Quick manual check of the displacement-extraction regex against a
    # sample style string. FIX: the pattern is now a raw string — the
    # original plain string contained escapes like "\s" that Python only
    # tolerates with a DeprecationWarning (a SyntaxError in future versions).
    strs = "奥迪 A8L 17款 6.3TSFI W12 旗"
    print(
        re.search(
            r"款\s.*?(\d+\.\d+i|\d+i|\d+\.\d+L[a-z]{0,1}|\d+L[a-z]{0,1}|\d+\.\d+T[A-Z]{0,3}|\d+T[A-Z]{0,3}|\d+ T[A-Z]{0,3})",
            strs).group(1))
import random import re import scrapy from fake_useragent import UserAgent from scrapy import Request # 账号:17061084088 # 密码:ma123456 from cars.CONSTANT import USA, Canada, Mexico, European, China, ChinaImport from cars.items import CarStyleItem, CarDetailItem from cars.log_utils import SelfLog from cars.settings import BASE_DIR from cars.utils import Mysqlpython, set_redis, deal_style, deal_year, deal_displacement, deal_guideprice r_zet_cookie = set_redis(2) set_url_r = set_redis(4) dbhelper = Mysqlpython() cookies_nnqc = "cookies_nnqc" unuseless_cookies_nnqc = "unuseless_cookies_nnqc" url_redis = "nnqc" url_redis_nnqc = "nnqc_urls" class NnqcSpider(scrapy.Spider): name = 'nnqc' allowed_domains = ['www.niuniuqiche.com'] start_urls = [ 'http://www.niuniuqiche.com/', 'http://www.niuniuqiche.com/v2/brands' ] i = 0
class DealDataMiddleware(object):
    """Scrapy downloader middleware that attaches proxies and rotating cookies.

    On request: picks a random proxy from ``proxies.txt`` (for the nnqc and
    chezhen spiders) and attaches the next cookie from a Redis-backed rotation
    list. On response: detects banned/expired sessions (302, limit page, or a
    re-login prompt in the body), retires the offending cookie into a
    "useless" hash, swaps in the next cookie, and re-issues the request.

    Redis layout (db 2), keyed off ``request.meta``:
      - ``meta['cookies_redis']``            hash: cookie-json -> phone number
      - ``meta['cookies_redis'] + "_list"``  list: cookie-json rotation queue
      - ``meta['useless_cookies']``          hash: retired cookie-json -> phone
    """

    # Redis handles: db 1 tracks crawled urls, db 2 the cookie pool,
    # db 4 a url set (unused in this class's visible methods).
    add_url_r = set_redis(db=1)
    cookies_deal_r = set_redis(db=2)
    set_url_r = set_redis(4)

    # Pick a random proxy line from proxies.txt.
    def get_proxy(self):
        """Return one randomly chosen proxy string from proxies.txt."""
        with open(os.path.join(BASE_DIR, "proxies.txt"), "r") as f:
            date = f.read().splitlines()
            proxy = random.choice(date)
            return proxy

    # # get request headers
    # def get_headers(self, request, spider):
    #     # sql = "select ua from useragent order by rand() limit 1;"
    #     cookie_dict = self.get_cookies(request, spider)
    #     cookie = ""
    #     for k, v in cookie_dict.items():
    #         cookie = cookie + k + "=" + v +";"
    #     print("得到的cookie值", cookie)
    #     headers = {
    #         # "User-Agent": dbhelper.readall(sql)[0][0],
    #         "User-Agent": UserAgent().random,
    #         "Content-Type": "text/html; charset=utf-8",
    #         "Host": "www.niuniuqiche.com",
    #         "Cookie": cookie
    #     }
    #     return headers

    def process_request(self, request, spider):
        """Attach a proxy (per spider) and the next rotation cookie to the request."""
        selflog = SelfLog(spider.name)
        # Proxy handling differs per spider; car168 deliberately gets none.
        if spider.name == "nnqc":
            request.meta['http_proxy'] = self.get_proxy()
            print("{}使用的代理为{},请求url:{}".format(spider.name,
                                              request.meta['http_proxy'],
                                              request.url))
        elif spider.name == "chezhen":
            request.meta['http_proxy'] = self.get_proxy()
            print("{}使用的代理为{},请求url:{}".format(spider.name,
                                              request.meta['http_proxy'],
                                              request.url))
        elif spider.name == "car168":
            pass
        # Attach the next cookie from the Redis rotation.
        request.cookies = self.get_cookies(request, spider)
        # cookie_dict = self.get_cookies(request, spider)
        # cookie = ""
        # for k, v in cookie_dict.items():
        #     cookie = cookie + k + "=" + v +";"
        # request.headers["User-Agent"] = UserAgent().random
        # request.headers["Content-Type"] = "text/html; charset=utf-8"
        # request.headers["Cookie"] = cookie
        # url = request.url
        # redis_key = request.meta['url_redis']
        # if self.add_url_r.sismember(redis_key, url):
        #     spider.logger.info("该url已经爬取,舍弃:%s"%url)
        #     raise IgnoreRequest
        return None

    def process_response(self, request, response, spider):
        """Detect ban/expiry responses; retire the cookie and retry, else pass through."""
        selferrorlog = SelfLog('error')
        selfinfolog = SelfLog(spider.name)
        # Move the cookie from the usable store into the unusable one on failure.
        cookie_redis_key_hash = request.meta['cookies_redis']
        cookie_redis_key = request.meta['cookies_redis'] + "_list"
        unuse_cookie_redis_key = request.meta['useless_cookies']
        if response.status == 302:
            # 302 here means a redirect to the login page (session banned).
            print(response.text)
            selferrorlog.logger.error(
                "{spidername}-被封,302重定向到登录界面{cookie}:".format(
                    spidername=spider.name, cookie=request.cookies))
            request = self.dealcookie(request, response, spider)
            return request
        elif "c=com&m=limitPage" in response.text:
            # Rate-limit page: same treatment as a ban.
            selferrorlog.logger.error(
                "{spidername}-重定向到限制界面, cookie值:{cookie}".format(
                    spidername=spider.name, cookie=request.cookies))
            request = self.dealcookie(request, response, spider)
            return request
        elif "请重新登录" in response.text:
            # "Please log in again" in the body: cookie expired or IP mismatch.
            selferrorlog.logger.error(
                "{spidername}-cookie:{cookies}过期,或者IP不一致,到登录界面".format(
                    spidername=spider.name, cookies=request.cookies))
            request = self.dealcookie(request, response, spider)
            return request
        selfinfolog.logger.info("请求url:{url}使用的cookie:{cookie}".format(
            url=response.url, cookie=request.cookies))
        return response

    # Handle an expired or banned cookie.
    def dealcookie(self, request, response, spider):
        """Retire the request's cookie and rebuild the request with the next one.

        Removes the cookie from the rotation list and live hash, records it in
        the useless hash, then pops the next cookie (cycling it to the list
        tail). If the list is exhausted, emails an alert and closes the spider.
        """
        selflog = SelfLog('error')
        cookie_redis_key_hash = request.meta['cookies_redis']
        cookie_redis_key_list = request.meta['cookies_redis'] + "_list"
        unuse_cookie_redis_key = request.meta['useless_cookies']
        redis_member = json.dumps(request.cookies)
        # Look up the phone number bound to this cookie in the live hash.
        # NOTE(review): this may be None if the cookie was never registered —
        # verify the subsequent hset accepts that value.
        zset_phone = self.cookies_deal_r.hget(cookie_redis_key_hash,
                                              redis_member)
        # Remove from the rotation list and the live hash,
        # then record it in the useless hash.
        self.cookies_deal_r.lrem(cookie_redis_key_list, 0, redis_member)
        self.cookies_deal_r.hdel(cookie_redis_key_hash, redis_member)
        self.cookies_deal_r.hset(unuse_cookie_redis_key, redis_member,
                                 zset_phone)
        # Pop the next cookie from Redis and rebuild the request with it.
        try:
            popcookie = self.cookies_deal_r.lpop(cookie_redis_key_list)
            self.cookies_deal_r.rpush(cookie_redis_key_list, popcookie)
        except Exception as e:
            selflog.logger.error("{spidername}--cookie 耗尽请补充, 错误信息:{e}".format(
                spidername=spider.name, e=e))
            # Email an alert and shut the spider down — no cookies left.
            sendEmail(content="{cookname}cookie耗尽,请尽快处理".format(
                cookname=cookie_redis_key_list))
            spider.crawler.engine.close_spider(
                spider, "{cookname}cookie耗尽,关闭爬虫".format(
                    cookname=cookie_redis_key_list))
        else:
            request.cookies = json.loads(popcookie)
        return request

    def get_cookies(self, request, spider):
        """Pop the next cookie json from the rotation list and return it as a dict.

        Cycles the popped cookie to the tail of the list so cookies rotate
        round-robin. On an empty list, emails an alert and closes the spider
        (returning None implicitly).
        """
        selflog = SelfLog('error')
        cookie_redis_key_list = request.meta['cookies_redis'] + "_list"
        cookie_redis_key_hash = request.meta['cookies_redis']
        unuse_cookie_redis_key = request.meta['useless_cookies']
        try:
            # cookies_dict = random.choice(self.cookies_deal_r.zscan(cookie_redis_key)[1])
            # Pop the cookie and push it back onto the tail (round-robin).
            popcookie = self.cookies_deal_r.lpop(cookie_redis_key_list)
            self.cookies_deal_r.rpush(cookie_redis_key_list, popcookie)
        except Exception as e:
            selflog.logger.error(
                "spidername:{spidername} 的cookie 耗尽请补充, 错误信息:{e}".format(
                    spidername=spider.name, e=e))
            # Email an alert and shut the spider down — no cookies left.
            sendEmail(content="{cookname}cookie耗尽,请尽快处理".format(
                cookname=cookie_redis_key_list))
            spider.crawler.engine.close_spider(
                spider, "{cookname}cookie耗尽,关闭爬虫".format(
                    cookname=cookie_redis_key_list))
        else:
            dicts = json.loads(popcookie)
            phonenum = self.cookies_deal_r.hget(cookie_redis_key_hash,
                                                popcookie)
            print("{cookie_redis}--手机号:{phonenum}--cookie:{cookie}".format(
                cookie_redis=cookie_redis_key_hash,
                phonenum=phonenum,
                cookie=dicts))
            return dicts
import json import os import random import re import time from multiprocessing import Process, Pool import requests from fake_useragent import UserAgent from requests.utils import get_encoding_from_headers from cars.login import signup, get_auth from cars.settings import BASE_DIR from cars.utils import set_redis cookie_r = set_redis(2) username = "******" password = "******" # 牛牛汽车注册id projectid_nnqc = "17883" cookies_nnqc = "cookies_nnqc" name_list = [ "裴玉", "陈英", "赵兵", "9442", "戴国强", "陶洪万", "朱洪纯", "徐亚玲",
import requests from cars.utils import Mysqlpython, set_redis r = set_redis() url = "http://www.chehang168.com/index.php?c=index&m=Cardata" headers = { "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36", "Host": "www.chehang168.com", # "Cookie": "DEVICE_ID=77cd83d79707a7fb386f712f2bef8db0; _uab_collina=158894954478649638062081; soucheAnalytics_usertag=RI8C9Ol6w8; U=1769515_8fb8933d8591166ea0b616db963ba427" "Cookie":"DEVICE_ID=77cd83d79707a7fb386f712f2bef8db0; _uab_collina=158894954478649638062081; soucheAnalytics_usertag=RI8C9Ol6w8; U=1769515_8fb8933d8591166ea0b616db963ba427" } # cookeis = { # "DEVICE_ID":"77cd83d79707a7fb386f712f2bef8db0", # "_uab_collina":"158894954478649638062081", # "soucheAnalytics_usertag":"RI8C9Ol6w8", # "U":"1769515_8fb8933d8591166ea0b616db963ba427", # # } dbhelper = Mysqlpython() def set_data_db(): result = requests.get(url=url, headers=headers) dicts = eval(result.text[14:]) for brand_encode, v in dicts.items(): print("编码品牌名", brand_encode) for index,(i,m) in enumerate(v.items()): # 第一层记录了名字 if index == 0: