def threadWork(t):
    """
    Process worker: scans the task queue and dispatches each spider task to a thread pool.
    :param t:
    :return:
    """
    redisUtils = RedisUtils()
    thread_pool = ThreadPool(THREAD_NUM * 10, q_size=THREAD_Q_SIZE)
    dict_json = redisUtils.getCons()
    logger.debug("Now have tasks -> " + str(dict_json))
    if dict_json != []:
        try:
            for i in dict_json:
                spider_name = re.findall('spider_(.*):task', i)[0]
                # for i in [i.split(":")[0] for i in dict_json]:
                requests = makeRequests(imptask, [spider_name])
                thread_pool.putRequest(requests[0])
            thread_pool.wait()
        except Exception as e:
            logger.error(e)
    else:
        time.sleep(1)
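# threadWork() above is a module-level dispatcher: it scans redis for
# 'spider_<name>:task' keys and hands each spider name to imptask() through the
# thread pool. A minimal sketch of a driver that keeps it polling follows;
# PROCESS_NUM and the multiprocessing wrapper are assumptions, not part of the
# original module.
from multiprocessing import Process
import time

PROCESS_NUM = 4  # assumed number of polling processes


def poll_forever():
    while True:
        threadWork(None)  # scan redis task keys and dispatch to the thread pool
        time.sleep(1)     # threadWork() already sleeps when idle; extra safety margin


if __name__ == '__main__':
    workers = [Process(target=poll_forever) for _ in range(PROCESS_NUM)]
    for p in workers:
        p.start()
    for p in workers:
        p.join()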
class SpiderClient():
    def __init__(self):
        """
        Initialise the client.
        :return:
        """
        # number of worker threads
        self.thread_num = THREAD_NUM
        # queue size: 1
        self.thread_q_size = THREAD_Q_SIZE
        # redis helper
        self.redisUtils = RedisUtils()

    def progressWork(self):
        """
        Process worker: starts thread_num worker threads.
        :return:
        """
        thread_pool = ThreadPool(self.thread_num, q_size=self.thread_q_size)
        for i in range(self.thread_num):
            dict_t = {}
            requests = makeRequests(self.threadWork, [dict_t])
            thread_pool.putRequest(requests[0])
        thread_pool.wait()

    def threadWork(self, t):
        """
        Thread worker: polls redis for a task.
        :param t: placeholder dict passed in by makeRequests (unused)
        :return:
        """
        startTime = datetime.now()
        logger.info('开始等待获取任务')
        while True:
            try:
                # fetch task data from the task queue
                dict_json = self.redisUtils.getCon(SOURCE)
                if dict_json is not None:
                    dict_json = decryptKwargs(dict_json)
                    # pass the token on to this thread's task
                    name = dict_json['token']
                    logger.info("获取到任务,%s" % name)
                    # sendMail(u"邮件log测试",'*****@*****.**')
                    dict_json.update({"result": []})
                    self.taskWork(dict_json)
                    break
                else:
                    finishTime = datetime.now()
                    # abs(): absolute value
                    if abs(finishTime.minute - startTime.minute) >= WAITTING:
                        break
                    time.sleep(1)
            except Exception:
                s = traceback.format_exc()
                logger.error(s)

    @Time()
    def taskWork(self, dict_json):
        """
        Run a single task: call the spider to crawl the data.
        :param dict_json:
        :return:
        """
        try:
            client = SpiderMain(dict_json)
            f = client.crawl()
            logger.info('任务结束:%s' % dict_json['token'])
            # On success the saved sample directory is removed; have the spider
            # return True if the samples should be kept.
            if not f:
                client.rmUserdir(client.userdir)
            # p1 = threading.Thread(target=client.crawl, args=("user",))
            # p2 = threading.Thread(target=client.crawl)
            # p1.start()
            # p2.start()
        except Exception:
            self.redisUtils.setNotify(token=dict_json['token'], val='2', decs='爬虫爬取失败')
            s = traceback.format_exc()
            logger.error(s)
            logger.info('任务结束:%s' % dict_json['token'])
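# SpiderClient above wires the pipeline together: progressWork() starts
# thread_num workers, each threadWork() call blocks on redis until a task
# arrives (or the wait window expires), and taskWork() hands the payload to
# SpiderMain.crawl(). A minimal entry-point sketch follows; the __main__ guard
# is an assumption, since the original launcher is not shown here.
if __name__ == '__main__':
    client = SpiderClient()
    client.progressWork()   # returns once all worker threads finish or time out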
class SpiderMain(hypc_accumulation_fund): logger = logging.getLogger() def __init__(self, user): self.url_area = { "北京": "http://bj.gsxt.gov.cn/sydq/loginSydqAction!sydq.dhtml", "天津": "http://tj.gsxt.gov.cn/index.html", "河北": "http://he.gsxt.gov.cn/notice/", "山西": "http://sx.gsxt.gov.cn/index.jspx", "内蒙古": "http://nm.gsxt.gov.cn:58888/", "辽宁": "http://ln.gsxt.gov.cn/saicpub/", "吉林": "http://jl.gsxt.gov.cn/", "黑龙江": "http://hl.gsxt.gov.cn/index.jspx", "上海": "http://sh.gsxt.gov.cn/notice", "江苏": "http://www.jsgsj.gov.cn:58888/province/", "浙江": "http://zj.gsxt.gov.cn/client/entsearch/toEntSearch", "安徽": "http://ah.gsxt.gov.cn/index.jspx", "福建": "http://fj.gsxt.gov.cn/notice", "江西": "http://jx.gsxt.gov.cn/", "山东": "http://sd.gsxt.gov.cn/", "广东": "http://gd.gsxt.gov.cn/", "广西": "http://gx.gsxt.gov.cn/sydq/loginSydqAction!sydq.dhtml", "海南": "http://hi.gsxt.gov.cn/index.jspx", "河南": "http://ha.gsxt.gov.cn/index.jspx", "湖北": "http://hb.gsxt.gov.cn/index.jspx", "湖南": "http://hn.gsxt.gov.cn/notice/", "重庆": "http://cq.gsxt.gov.cn/", "四川": "http://sc.gsxt.gov.cn/notice/", "贵州": "http://gz.gsxt.gov.cn/", "云南": "http://yn.gsxt.gov.cn/notice/", "西藏": "http://xz.gsxt.gov.cn/index.jspx", "陕西": "http://sn.gsxt.gov.cn/ztxy.do?method=index&random=", "甘肃": "http://gs.gsxt.gov.cn/gsxygs/", "青海": "http://qh.gsxt.gov.cn/index.jspx", "宁夏": "http://nx.gsxt.gov.cn/", "新疆": "http://xj.gsxt.gov.cn/sydq/loginSydqAction!sydq.dhtml" } self.session = requests.session() self.redisUtils = RedisUtils() self.PROXYADDR = PROXYADDR self.bcode = NOT_NEED_BCODE self.status = CRAWL_READY self.desc = "" self.area = user.get("area", "") self.keyword = user.get("idCard", "") self.token = user.get("token", "") self.LoginUrl = self.url_area.get( self.area, "") if self.area else "http://www.gsxt.gov.cn/index.html" self.result = user.get("result", "") self.GJJInfo = [] self.br = self.get_webdriver("chrome") self.br.dc = DriverClean(1, time.time(), self.br.service.process.pid, self.br) self.br.get1 = MethodType(get1, self.br, webdriver.Chrome) self.br.find_element_by_xpath1 = MethodType(find_element_by_xpath1, self.br, webdriver.Chrome) global globallogger globallogger = self.logger # self.br.maximize_window() # self.br.set_window_size(1300,900) # self.proxy = self._proxy() # proxy=webdriver.Proxy() # proxy.proxy_type=ProxyType.MANUAL # proxy.http_proxy=self.proxy # 将代理设置添加到webdriver.DesiredCapabilities.PHANTOMJS中 # proxy.add_to_capabilities(self.dcap) # self.br.start_session(self.dcap) # self.br.get('http://httpbin.org/ip') # print self.br.page_source self.wait = WebDriverWait(self.br, 10, 0.5) self.br.set_page_load_timeout(10) self.br.set_script_timeout(15) self.br.implicitly_wait(10) # 加入当次代理 # self.proxy = self._proxy() def _proxy(self): proxy = self.session.get(self.PROXYADDR).content # return {"http": "http://" + proxy, "https": "http://" + proxy} return proxy def run(self, keyword): content = self.hack_geetest(keyword.decode('utf8')) if not content[0].get("base_info", "") and not content[0].get( "admin_penalty_info", "" ) and not content[0].get( "operate_abnormal_info", "" ) and not content[0].get("key_person_info", "") and not content[0].get( "change_info", "") and not content[0].get( "check_info", "") and not content[0].get( "chattel_info", "") and not content[0].get( "branch_info", "") and not content[0].get( "equity_pledged_info", "") and not content[0].get( "Shareholder_info", "") and not content[0].get( "judicial_assist_info", "") and not content[0].get( "knowledge_info", "") and not content[0].get( "brand_info", "") and not 
content[0].get( "annual_shareholder_info", "") and not content[0].get( "annual_info", ""): self.status = PASSWORD_IS_NULL self.quit_webdriver() return content def wait_for(self, by1, by2): self.br.dc.setts(time.time()) self.br.dc.setstatus(0) return self.wait.until(EC.presence_of_element_located((by1, by2))) def input_params(self, name): self.logger.info('正在打开官网URL') try: self.br.get1(self.LoginUrl) # self.br.refresh() except Exception as e: self.logger.error(e) try: # 防止再次刷新的时候出现卡死,所以没用热刷新的方式 self.br.get1(self.br.current_url) except Exception as f: self.logger.error(f) self.logger.info('已经进入官网') ui.WebDriverWait(self.br, 10).until( EC.visibility_of_element_located( (By.XPATH, '//*[@id="btn_query"]'))) element = self.wait_for(By.ID, "keyword") element.send_keys(name) element = self.wait_for(By.ID, "btn_query") element.click() self.status = CRAWL_SUCCESS def _save_captcha(self, codeurl): """ 下载验证码,返回图片b64编码, """ self.logger.info("刷新验证码") try: codeContent = self.session.get(codeurl, headers=IMGHEADERS).content self.logger.debug("验证码二进制内容:{0}".format(codeContent)[:50]) self.logger.info("下载验证码") self.status = NEED_BCODE with open( os.path.join(os.path.dirname(__file__), "captcha.png").replace("\\", "/"), 'wb') as f: f.write(codeContent) self.logger.info("验证码图片已保存!") bcode = base64.b64encode(codeContent) self.status = CRAWL_SUCCESS return bcode except: s = traceback.format_exc() self.logger.error("刷新验证码错误:%s" % s) self.status, self.desc = BCODE_IS_NULL, BCODE_IS_NULL_DESC # return {"error": "超时或代码异常"} def quit_webdriver(self): self.br.quit() self.br.dc.setterm(1) def get_webdriver(self, name): '''选择爬取的方式''' try: if name.lower() == "phantomjs": self.dcap = dict(DesiredCapabilities.PHANTOMJS) self.dcap[ "phantomjs.page.customHeaders.User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36" self.status = CRAWL_SUCCESS return webdriver.PhantomJS(desired_capabilities=self.dcap) elif name.lower() == "chrome": display = Display(visible=0, size=(1920, 1080)) display.start() self.status = CRAWL_SUCCESS # return webdriver.Chrome("/usr/local/bin/chromedriver") return webdriver.Chrome() elif name.lower() == "firefox": display = Display(visible=0, size=(1920, 1080)) display.start() self.status = CRAWL_SUCCESS return webdriver.Firefox() except Exception as e: self.logger.error('运行无头浏览器错误') self.status, self.decs = PASSWORD_IS_NULL, PASSWORD_IS_NULL_DESC def get_bcode(self): '''获取从redis中获取的坐标值''' self.startTime = datetime.now() while True: self.logger.info(u'等待用户传入坐标') inputValue = self.redisUtils.getNotify(self.token, 'bcode') if inputValue: coordinate = self.prase_bcode(inputValue) self.redisUtils.DelNotify(self.token, "bcode") return coordinate else: self.finishTime = datetime.now() tail_time = self.finishTime - self.startTime if tail_time.total_seconds() > 120: self.logger.info('接收用输入超时:%s' % self.token) self.desc = '接收用户输入超时' time.sleep(1) break time.sleep(1) def prase_bcode(self, zuobiao): '''坐标处理''' try: zuobiao_list1 = zuobiao.split('_') zuobiao_list = zuobiao_list1[1:] len_zuobiao = len(zuobiao_list) / 2 coordinate = [] for num in range(len_zuobiao): list_n = [] list_n.append(zuobiao_list[2 * num]) list_n.append(zuobiao_list[2 * num + 1]) coordinate.append(list_n) self.status = CRAWL_SUCCESS return coordinate except Exception as e: self.logger.error('处理坐标异常: %s' % e) self.status = IDCARD_ERROR return '' def element_click(self, coor): '''模拟依次点击''' try: if coor and self.br.find_element_by_class_name("geetest_item_img"): for 
i in range(len(coor)): element = self.br.find_element_by_class_name( "geetest_item_img") ActionChains(self.br).move_to_element_with_offset( to_element=element, xoffset=int(coor[i][0]) - 7, yoffset=int(coor[i][1]) - 5).perform() ActionChains(self.br).click().perform() time.sleep(0.8) element_cli = self.wait_for(By.CLASS_NAME, "geetest_commit_tip") element_cli.click() time.sleep(0.5) element = self.wait_for(By.CLASS_NAME, "geetest_result_tip") ans = element.text.encode("utf-8") self.status = CRAWL_SUCCESS return ans else: self.logger.info('暂无图片点击') return '失败' except Exception as e: self.logger.error('破解验证码失败') self.status = PASSWORD_ERROR def click_pic(self, info): '''获取图片,将图片的base64发送给服务或第三方来处理''' self.logger.info('需要用户提供点击操作') try: ima_url = ifNotEmptyGetIndex( info.xpath("//*[@class='geetest_item_img']/@src")) card_base = self._save_captcha(ima_url) Data = { 'token': self.token, 'img_base64': card_base, 'spider_name': 'gsxt', 'userid': 'yinzhouyinhang' } content = self.session.post(url='http://127.0.0.1:8000/img', data=Data).content content = eval(content) url_imag = content.get("result", "") self.decs = '请输入url,请求图片并点击' redis_dict = { "image_url": url_imag, "image_base64": "data:image/jpg;base64," + card_base, "token": self.token } self.redisUtils.setNotify(token=self.token, val="1", decs=self.decs, result=redis_dict) self.br.dc.setstatus(1) self.br.dc.setts(time.time()) self.logger.info("begin wait bcode....") co_ordinate = self.get_bcode() self.logger.info("begin set status....") self.br.dc.setstatus(0) self.br.dc.setts(time.time()) ans = self.element_click(co_ordinate) return ans except Exception as e: self.logger.error(e) self.status = CRAWL_SUCCESS def hack_geetest(self, company='大连火眼征信管理有限公司北京'): '''爬取的流程''' try: self.input_params(company) for i in range(10): self.logger.info(u'开始判断验证码的类型') time.sleep(1) info = etree.HTML(str(self.br.page_source)) if info.xpath("//*[@class='geetest_item_img']/@src"): for j in range(3): info2 = etree.HTML(str(self.br.page_source)) ans = self.click_pic(info2) self.logger.info('破解验证码结果: %s' % ans) if '成功' in ans: ui.WebDriverWait(self.br, 10).until( EC.visibility_of_element_located( (By.XPATH, '//*[@class="search_result_span1"]'))) self.status = CRAWL_SUCCESS return country1(self.br).data() elif '失败' in ans and j < 2: time.sleep(3) self.logger.info('图片验证点击失败') self.status = BCODE_ERROR elif '失败' in ans and j == 2: self.logger.info('最后一次图片验证点击失败') self.status = BCODE_ERROR return else: self.logger.info('图片验证点击失败,正在重新请求') self.status = BCODE_ERROR elif info.xpath( "//*[@class='geetest_slider_track']/div/text()"): self.logger.info('系统正在重试') self.input_params(company) else: time.sleep(2) info = etree.HTML(str(self.br.page_source)) if info.xpath("//*[@class='ads-right']/div[1]/div/text()"): self.status = CRAWL_SUCCESS return country1(self.br).data() else: self.logger.info('尝试再次访问') time.sleep(1) self.input_params(company) except Exception as e: self.logger.error('智能检测程序错误%s' % e) self.status = CRAWL_FAIL # return e def login(self, flag): if self.area == '陕西': millis = int(round(time.time() * 1000)) self.LoginUrl = self.LoginUrl + str(millis) try: content = self.run(self.keyword) return content except Exception as e: self.status = CRAWL_FAIL self.logger.error('抓取错误:%s' % e) @Time() def crawl(self, flag=""): CurTime = datetime.now().strftime("%Y-%m-%d") PastTime = (datetime.now() - timedelta(days=729)).strftime("%Y-%m-%d") try: content = self.login(flag) self.GJJInfo.append(content) self.result.append(self.GJJInfo[0]) except: s = traceback.format_exc() 
self.logger.error("抓取错误:%s" % s) self.status, self.desc = self.status, PROGRAM_ERROR_DESC finally: try: if len(self.result) == 1 and self.status == CRAWL_SUCCESS: self.desc = CRAWL_SUCCESS_DESC result_json = json.dumps(self.result[0], ensure_ascii=False) print result_json self.redisUtils.setNotify(type=TYPEVALUE, token=self.token, val="1", decs="抓取成功!", result=result_json) elif self.status == CRAWL_FAIL: self.desc = CRAWL_FAIL_DESC elif self.status == CRAWL_TIMEOUT: self.desc = CRAWL_TIMEOUT_DESC elif self.status == IDCARD_ERROR: self.desc = IDCARD_ERROR_DESC elif self.status == PASSWORD_ERROR: self.desc = PASSWORD_ERROR_DESC elif self.status == BCODE_ERROR: self.desc = BCODE_ERROR_DESC elif self.status == PASSWORD_IS_NULL: self.desc = PASSWORD_IS_NULL_DESC else: self.desc = PROGRAM_ERROR_DESC except Exception as e: s = traceback.format_exc() self.logger.error(s) finally: try: self.redisUtils.setNotify(type=TYPEVALUE, token=self.token, val=self.status, decs=self.desc) except Exception: s = traceback.format_exc() self.logger.error(s) def zipToStr(self, content): ''' 使用urllib2获取到的内容被压缩,需要进行解压缩 :param content: 需要解压的内容 :return: ''' try: conn = zlib.decompress(content, 16 + zlib.MAX_WBITS) return conn except: self.logger.error('解压缩响应内容出错%s' % traceback.format_exc()) raise Exception("解压缩响应内容出错%s" % traceback.format_exc())
class SpiderClient():
    def __init__(self):
        """
        Initialise the client.
        :return:
        """
        # number of worker threads
        self.thread_num = THREAD_NUM
        # queue size: 1
        self.thread_q_size = THREAD_Q_SIZE
        # redis helper
        self.redisUtils = RedisUtils()

    def progressWork(self):
        """
        Process worker: starts thread_num worker threads.
        :return:
        """
        thread_pool = ThreadPool(self.thread_num, q_size=self.thread_q_size)
        for i in range(self.thread_num):
            dict_t = {}
            requests = makeRequests(self.threadWork, [dict_t])
            thread_pool.putRequest(requests[0])
        thread_pool.wait()

    def threadWork(self, t):
        """
        Thread worker: polls redis for a task.
        :param t:
        :return:
        """
        startTime = datetime.now()
        while True:
            try:
                # fetch task data from the task queue
                dict_json = self.redisUtils.getCon(SOURCE)
                # print dict_json,'1234'
                if dict_json is not None:
                    dict_json = decryptKwargs(dict_json)
                    # pass the task keyword on to this thread's task
                    name = dict_json.get('keyword', '')
                    logger.info("获取到任务,%s" % name)
                    # sendMail(u"邮件log测试",'*****@*****.**')
                    dict_json.update({"result": []})
                    self.taskWork(dict_json)
                else:
                    finishTime = datetime.now()
                    if abs(finishTime.minute - startTime.minute) >= WAITTING:
                        break
                    time.sleep(1)
            except Exception:
                s = traceback.format_exc()
                logger.error(s)

    @Time()
    def taskWork(self, dict_json):
        """
        Run a single task: call the spider to crawl the data.
        :param dict_json:
        :return:
        """
        token = dict_json['token']
        try:
            client = SpiderMain(dict_json)
            logger.info("不需要抓取图片验证码,token:%s" % token)
            p1 = threading.Thread(target=client.crawl, args=("user",))
            # p2 = threading.Thread(target=client.crawl, args=("auto",))
            p1.start()
            # p2.start()
        except Exception:
            s = traceback.format_exc()
            logger.error(s)
class SpiderMain(hypc_translate): logger = logging.getLogger() def __init__(self, user): self.session = requests.session() self.redisUtils = RedisUtils() self.PROXYADDR = PROXYADDR self.bcode = NOT_NEED_BCODE self.status = CRAWL_READY self.desc = "" self.title = user.get("title", "") # print chardet.detect(self.title) self.title = urllib.quote(self.title.encode('utf8')) self.project_district = user.get("project_district") self.project_developer_name_value = user.get("project", "") self.date_filter_min = user.get("date_filter_min","") self.date_filter_max = user.get("date_filter_max","") # print self.title,self.project_developer_name_value,self.project_district,self.date_filter_max,self.date_filter_min # self.title = urllib.quote(self.title.decode(sys.stdin.encoding).encode('utf-8')) # self.project_district = urllib.quote(self.project_district.decode(sys.stdin.encoding).encode('utf8')) # self.project_developer_name_value = urllib.quote(self.project_developer_name_value.decode(sys.stdin.encoding).encode('utf8')) # self.date_filter_min = urllib.quote(self.date_filter_min.decode(sys.stdin.encoding).encode('utf8')) # self.date_filter_max = urllib.quote(self.date_filter_max.decode(sys.stdin.encoding).encode('utf8')) # self.token = user.get("token", "") self.userid = user.get("userid", "") self.LoginUrl = "https://newhouse.cnnbfdc.com" self.result = user.get("result", "") self.GJJInfo = [] self.bild = [] # 加入当次代理 # self.proxy = self._proxy() def ifNotEmptyGetIndex(self, somelist, index=0): """check to see it's not empty""" if somelist: return somelist[index] else: return '' def _proxy(self): proxy = self.session.get(self.PROXYADDR).content return {"http": "http://" + proxy, "https": "http://" + proxy} def _errhtmlRecord(self, content): ''' 错误页面保存 ''' self.logger.info("保存错页内容") try: filename = str(uuid.uuid1()) + ".html" sampleDir = os.path.join(os.path.dirname(__file__), "errorHtml").replace("\\", "/") os.path.exists(sampleDir) or os.mkdir(sampleDir) with open("%s/%s" % (sampleDir, filename), 'w') as f: f.write(str(content)) self.logger.debug("已保存错页内容到{0}".format(filename)) except Exception: self.status = PROGRAM_ERROR s = traceback.format_exc() self.logger.info("保存错页出错") self.logger.warn("{0}".format(s)) def _sampleRecord(self, filename, content): ''' 保存网页内容 ''' self.logger.info("保存网页内容") try: sampleDir = os.path.join(os.path.dirname(__file__), "sample/").replace("\\", "/") os.path.exists(sampleDir) or os.mkdir(sampleDir) with open("%s/%s" % (sampleDir, filename), 'w') as f: f.write(content) self.logger.debug("已保存网页内容到{0}".format(sampleDir)) except Exception: self.status = PROGRAM_ERROR s = traceback.format_exc() self.logger.info("保存网页出错") self.logger.warn("{0}".format(s)) def _fetchUrl(self, url, data=None, header=None, timeout=TIMEOUT, fileName=None, proxy=None): ''' 抓取方法 ''' self.logger.info("开始抓取 {0}".format(url)) if header: headers = header self.logger.debug("伪装头:{0}".format(headers)) else: headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0"} self.logger.debug("伪装头:{0}".format(headers)) for ir in range(REQUEST_RETRY): try: self.logger.debug("第{0}次 抓取".format(ir)) if data: if proxy: content = self.session.post(url, data=data, headers=headers, timeout=timeout, allow_redirects=False, proxies=proxy) self.logger.debug("POST url:{0}, data:{1}, proxy: {2}".format(url, data, proxy)) else: content = self.session.post(url, data=data, headers=headers, timeout=timeout, allow_redirects=False) self.logger.debug("POST url:{0}, data:{1}".format(url, data)) 
else: if proxy: content = self.session.get(url, headers=headers, timeout=timeout, allow_redirects=False, proxies=proxy) self.logger.debug("POST url:{0}, proxy: {1}".format(url, proxy)) else: content = self.session.get(url, headers=headers, timeout=timeout, allow_redirects=False) # print content.encoding self.logger.debug("Get url:{0}".format(url)) if fileName and SAMPLEFLAG: self._sampleRecord(fileName, content.content) return content except: self.logger.error(traceback.format_exc()) self.logger.error("request url {0} failed ,check pls".format(url)) self.status = CRAWL_TIMEOUT raise Exception("Failed to load url (%s)" % url) def login(self,flag): first_url = "https://newhouse.cnnbfdc.com/publicity/project-licenses?title="+str(self.title)+"&project_district="+str(self.project_district)+"&project_developer_name_value="+str(self.project_developer_name_value)+"&date_filter%5Bmin%5D%5Bdate%5D="+str(self.date_filter_min)+"&date_filter%5Bmax%5D%5Bdate%5D="+str(self.date_filter_min) content = self._fetchUrl(url=first_url, header=LOGINHEADERS, fileName="login.html") return str(content.text) @Time() def crawl(self, flag=""): CurTime = datetime.now().strftime("%Y-%m-%d") PastTime = (datetime.now() - timedelta(days=729)).strftime("%Y-%m-%d") try: # login content = self.login(flag) url_num = re.compile(r'/project_license_view/([0-9]+)">') url_num_list = url_num.findall(content) if len(url_num_list) > 0: for url_num in url_num_list: url_detail = self.LoginUrl + '/project_license_view/' + str(url_num) self.logger.info("可查询到您查的信息:%s" % self.title) # 项目详情信息 content = self._fetchUrl(url=url_detail, header=PERHEADERS, fileName="person.html") detail = etree.HTML(content.text) project_name = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='project-detail__info']/div[1]/h1/text()")) # 项目名称 alias_name = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='project-detail__info']/div[1]/div/text()")) # 别名 positioning = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='project-detail__info']/div[2]//span/text()"))#定位 company_name = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='project-detail__info']/div[3]//span/text()")) # 公司名称 project_id = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='project-detail__info']/div[4]/span/text()")) #项目编号 counts = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-entity-view pane-node']/div[2]/div[2]/div/div[1]/div/text()")) or self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-entity-view pane-node']/div[2]/div/div[2]/div[1]/div/strong/text()")) # 套数 area = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-entity-view pane-node']/div[2]/div[2]/div/div[2]/div/text()")) or self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-entity-view pane-node']/div[2]/div/div[2]/div[2]/div/strong/text()"))# 面积 #数据汇总 marketable_area = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-panel panel-col-first']/div/div[1]/div/text()"))# 可销售面积 sales_area = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-panel panel-col-first']/div/div[2]/div/text()"))#已销售面积 has_sold_area = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-panel panel-col-first']/div/div[3]/div/text()")) #已销售非住宅面积 number_sellable_households = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-panel panel-col-last']/div/div[1]/div/text()")) # 可售户数 has_sold_number = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-panel panel-col-last']/div/div[2]/div/text()")) # 已销售户数 has_sold_households = 
self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-panel panel-col-last']/div/div[3]/div/text()")) # 已销售非住宅户数 # 详细参数 permit_number = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='parameter-table']/tr[1]/td[2]/text()"))# 许可证号 permission_date = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='parameter-table']/tr[1]/td[4]/span/text()"))# 许可日期 sales_address = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='parameter-table']/tr[2]/td[2]/text()"))# 售楼地址 sales_call = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='parameter-table']/tr[2]/td[4]/text()"))# 售楼电话 number_buildings = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='parameter-table']/tr[3]/td[2]/text()"))# 幢数 construction_area = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='parameter-table']/tr[3]/td[4]/text()"))# 建筑面积 opening_time = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='parameter-table']/tr[4]/td[2]/span/text()"))# 开盘时间 supervision_account = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='parameter-table']/tr[4]/td[4]/text()"))# 资金监管账户 document_authority = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='parameter-table']/tr[5]/td[2]/text()"))# 证件发布机构 financial_bank = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='parameter-table']/tr[5]/td[4]/text()"))# 资金监管银行 # 楼栋信息 loudong_list = detail.xpath("//*[@class='panel-pane pane-views-panes pane-project-license-buildings-panel-pane-1']/div/div/div") if len(loudong_list) > 0: for i in range(len(loudong_list)): i = i + 1 num_floors = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-views-panes pane-project-license-buildings-panel-pane-1']/div/div/div["+str(i)+"]/div[1]/div/text()")) # 楼号 total_floors = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-views-panes pane-project-license-buildings-panel-pane-1']/div/div/div["+str(i)+"]/div[2]/div/text()")) # 总层数 # total_houses = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-views-panes pane-project-license-buildings-panel-pane-1']/div/div/div["+str(i)+"]/div[3]/div/text()")) # 总户数 total_houses = '' permitted_households = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-views-panes pane-project-license-buildings-panel-pane-1']/div/div/div["+str(i)+"]/div[3]/div/text()")) # 许可户数 has_sold_number_households = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-views-panes pane-project-license-buildings-panel-pane-1']/div/div/div["+str(i)+"]/div[4]/div/text()")) # 已销售户数 has_sold_residential_households = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-views-panes pane-project-license-buildings-panel-pane-1']/div/div/div["+str(i)+"]/div[5]/div/text()")) # 已销售非住宅户数 wangqian_list = [] wangqian = str(self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-views-panes pane-project-license-buildings-panel-pane-1']/div/div/div["+str(i)+"]/div[6]/span/a/@href"))) # 网签的URL if wangqian: lou_numb = re.compile(r'buildingId=([0-9]+)') wang_numb = self.ifNotEmptyGetIndex(lou_numb.findall(wangqian)) wangqian_url = "https://newhouse.cnnbfdc.com//map-api-v1/building_units?args[]=" + wang_numb wang_content = self._fetchUrl(url=wangqian_url, header=IMGHEADERS, fileName="wangqian.html") wangqian_info = etree.HTML(str(wang_content.text)) for j in range(len(wangqian_info.xpath("//result/item"))): j = j + 1 if self.ifNotEmptyGetIndex(wangqian_info.xpath("//result/item["+str(j)+"]/state/text()")) == '3': number = self.ifNotEmptyGetIndex(wangqian_info.xpath("//result/item["+str(j)+"]/number/text()")) wangqian_list.append(number) 
self.bild.append(hypc_translate.detail_building( num_floors = num_floors, total_floors = total_floors, total_houses = total_houses, permitted_households = permitted_households, has_sold_number_households = has_sold_number_households, has_sold_residential_households = has_sold_residential_households, wangqian_nubm=wangqian_list )) else: self.bild.append(hypc_translate.detail_building( num_floors = '', total_floors = '', total_houses = '', permitted_households = '', has_sold_number_households = '', has_sold_residential_households = '', )) self.GJJInfo.append(hypc_translate.baseinfo( project_name = project_name, alias_name = alias_name, # 别名 positioning = positioning, #定位 company_name = company_name, # 公司名称 project_id = project_id, #项目编号 counts = counts, # 套数 area = area, # 面积 marketable_area = marketable_area, # 可销售面积 sales_area = sales_area, #已销售面积 has_sold_area = has_sold_area, #已销售非住宅面积 number_sellable_households = number_sellable_households, # 可售户数 has_sold_number = has_sold_number, # 已销售户数 has_sold_households = has_sold_households, # 已销售非住宅户数 permit_number = permit_number, # 许可证号 permission_date = permission_date, # 许可日期 sales_address = sales_address, # 售楼地址 sales_call = sales_call, # 售楼电话 number_buildings = number_buildings, # 幢数 construction_area = construction_area, # 建筑面积 opening_time = opening_time, # 开盘时间 supervision_account = supervision_account, # 资金监管账户 document_authority = document_authority, # 证件发布机构 financial_bank = financial_bank, # 资金监管银行 bulding = self.bild )) self.logger.info("解析完成") self.status= CRAWL_SUCCESS self.result.append(self.GJJInfo) else: self.logger.info("暂无您查询的信息:%s" % IDCARD_ERROR_DESC) self.status= CRAWL_SUCCESS self.result.append(self.GJJInfo) except: s = traceback.format_exc() self.logger.error("抓取错误:%s" % s) self.status, self.desc = CRAWL_FAIL, PROGRAM_ERROR_DESC finally: try: if len(self.result) == 1 and self.status == CRAWL_SUCCESS: self.desc = CRAWL_SUCCESS_DESC # print self.result result_json = json.dumps(self.result[0], ensure_ascii=False) # print result_json self.redisUtils.setNotify(type=TYPEVALUE,token=self.token, val="1", decs="抓取成功!", result=result_json) # self.push_data(TYPEVALUE, self.userid, result_json) elif self.status == CRAWL_FAIL: self.desc = CRAWL_FAIL_DESC elif self.status == CRAWL_TIMEOUT: self.desc = CRAWL_TIMEOUT_DESC elif self.status == IDCARD_ERROR: self.desc = IDCARD_ERROR_DESC elif self.status == PASSWORD_ERROR: self.desc = PASSWORD_ERROR_DESC elif self.status == BCODE_ERROR: self.desc = BCODE_ERROR_DESC else: self.desc = PROGRAM_ERROR_DESC except Exception as e: s = traceback.format_exc() self.logger.error(s) finally: try: self.redisUtils.setNotify(type=TYPEVALUE, token=self.token, val=self.status, decs=self.desc) except Exception: s = traceback.format_exc() self.logger.error(s) def zipToStr(self, content): ''' 使用urllib2获取到的内容被压缩,需要进行解压缩 :param content: 需要解压的内容 :return: ''' try: conn = zlib.decompress(content, 16 + zlib.MAX_WBITS) return conn except: self.logger.error('解压缩响应内容出错%s' % traceback.format_exc()) raise Exception("解压缩响应内容出错%s" % traceback.format_exc())
class CrawlBase(object):
    def __init__(self, dict_json, key=[], verifycode_type='png'):
        self.redisUtils = RedisUtils()
        self.damatuWeb = damatuWeb
        self.PROXYADDR = PROXYADDR
        self.dict_json = dict_json
        self.token = self.dict_json['token']
        self.verifycode_type = verifycode_type
        self.status = None
        self.desc = None
        self.current_milli_time = lambda: str(int(round(time.time() * 1000)))
        self.startTime = self.current_milli_time()
        self.realpath = os.path.split(os.path.realpath(__file__))[0]
        filename = 'verifycode/%s_verifycode.%s' % (self.startTime, verifycode_type)
        self.code_path = os.path.join(self.realpath, filename)
        logging.config.fileConfig('unicom/logging.config')
        self.logger = logging.getLogger('flow')
        # Whether to keep the user directory: by default it is kept only when the
        # spider fails, but the flag can also be set manually.
        self.rmuserdirFlag = False
        self.mkUserdir(key)

    '''
    User info and fetched-page records: removed when the crawl succeeds,
    kept on failure to simplify debugging.
    '''

    def mkUserdir(self, key=[]):
        # create the per-user directory
        fn = os.path.join(self.realpath, 'sample',
                          '%s_%s' % (self.startTime, self.dict_json['token']))
        os.mkdir(fn)
        # mask sensitive fields
        info_dict = copy.deepcopy(self.dict_json)
        for k in key:
            info_dict[k] = u'******'
        with open(os.path.join(fn, 'dict_json.txt'), 'w') as f:
            f.write(str(info_dict))
        self.userdir = fn
        return

    def rmUserdir(self, fn):
        shutil.rmtree(fn)

    def takePage(self, n, content, msg=None):
        fn = os.path.join(self.userdir, n)
        with open(fn, 'w') as f:
            f.write(content)
        if msg:
            with open(fn, 'a') as f:
                f.write('\n' * 5 + '#' * 60 + '\n' * 3 + msg)

    '''
    Captcha interaction helpers.
    '''

    def get_verifycode(self, codeUrl=None):
        # Fetch the image captcha and notify the user; override this method for
        # SMS captchas or when a webdriver must be used.
        if callable(codeUrl):
            codeUrl = codeUrl()
        codeContent = self.session.get(codeUrl).content
        bcode = base64.b64encode(codeContent)
        self.redisUtils.setNotify(token=self.token, val=NEED_MORE, decs='需要图片验证码',
                                  result='data:image/jpg;base64,' + bcode)
        self.logger.info('验证码已发送')

    def judge_verifycode(self, inputValue, ResetCode):
        # verifycode_handler() uses the return value of this method to decide
        # whether the captcha was correct. Spider-specific: if the captcha is not
        # wrong, the value returned here is passed back by verifycode_handler().
        pass

    def get_input(self):
        # wait for the user's input
        stime = datetime.now()
        self.logger.info('等待用户输入')
        while True:
            inputValue = self.redisUtils.getNotify(self.token, 'bcode')
            if inputValue:
                return inputValue
            else:
                eclipseTimes = datetime.now() - stime
                if eclipseTimes.total_seconds() > WAITTIME:
                    self.logger.info('接收用输入超时:%s' % self.token)
                    self.status = INPUT_ERROR
                    self.desc = '接收用输入超时'
                    return
                time.sleep(1)

    def verifycode_handler(self, codeUrl=None, ResetCode=False):
        # Interactive flow: accept the user's response, which may be a request to
        # refresh the captcha or the captcha value itself.
        # ResetCode controls whether the session is kept alive so the captcha can
        # be refreshed; disabled by default.
        self.logger.info('需要验证码')
        self.get_verifycode(codeUrl)
        while True:
            inputValue = self.get_input()
            if inputValue == 'reset':
                if ResetCode:
                    self.logger.info('用户刷新验证码')
                    self.redisUtils.DelNotify(self.token, 'bcode')
                    self.get_verifycode(codeUrl)
                continue
            elif inputValue is None:
                return
            else:
                # check whether the captcha input is correct
                result = self.judge_verifycode(inputValue, ResetCode)
                if result:
                    return result
                else:
                    if ResetCode:
                        self.redisUtils.DelNotify(self.token, 'bcode')
                        self.redisUtils.setNotify(token=self.token, val=INPUT_ERROR, decs='验证码错误')
                        self.logger.info('验证码错误')
                        continue
                    self.status = INPUT_ERROR
                    self.desc = '验证码错误'
                    return
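# CrawlBase above is the shared scaffold for the interactive spiders: it
# snapshots the task payload under sample/, and verifycode_handler() loops
# between get_verifycode() (push a captcha to the user via redis) and
# judge_verifycode() (the spider-specific check of the user's answer).
# A minimal subclass sketch, under stated assumptions: the demo URLs, the
# 'password' key and the 'ok' success check are illustrative only.
import requests

class DemoSpider(CrawlBase):
    def __init__(self, dict_json):
        # get_verifycode() expects self.session to exist before it is called.
        self.session = requests.session()
        CrawlBase.__init__(self, dict_json, key=['password'])  # mask password in dict_json.txt

    def judge_verifycode(self, inputValue, ResetCode):
        # Return a truthy value once the code is accepted; verifycode_handler()
        # hands that value back to the caller and stops looping.
        resp = self.session.post('http://example.com/check', data={'code': inputValue})
        return inputValue if 'ok' in resp.text else None

# Typical use:
#   spider = DemoSpider({'token': 'abc123', 'password': 'secret'})
#   code = spider.verifycode_handler(codeUrl='http://example.com/captcha.png', ResetCode=True)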
class SpiderMain(craw_dishonest): logger = logging.getLogger() def __init__(self, user): self.session = requests.session() self.redisUtils = RedisUtils() self.PROXYADDR = PROXYADDR self.bcode = NOT_NEED_BCODE self.status = CRAWL_READY self.desc = "" self.username = urllib.quote(user.get("name", "")) self.idcard = urllib.quote(user.get("idcard", "")) self.area = urllib.quote(user.get("area", "")) self.token = user.get("token", "") self.userid = user.get("userid", "") # urllib.quote(a) self.LoginUrl = "https://sp0.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php?" + "resource_id=6899&query=%E5%A4%B1%E4%BF%A1%E8%A2%AB%E6%89%A7%E8%A1%8C%E4%BA%BA%E5%90%8D%E5%8D%95&cardNum=" + self.idcard + "&iname=" + self.username + "&areaName=" + self.area + "&ie=utf-8&oe=utf-8&format=json&t=" + str( int(round(time.time() * 1000))) + "&cb=jQuery110207690611877233657_" + str( int(round(time.time() * 1000))) + "&_=" + str( int(round(time.time() * 1000))) self.result = user.get("result", "") self.GJJInfo = [] self.PerInfo = {} self.PayRecord = {} # 加入当次代理 # self.proxy = self._proxy() def _proxy(self): proxy = self.session.get(self.PROXYADDR).content return {"http": "http://" + proxy, "https": "http://" + proxy} def _errhtmlRecord(self, content): ''' 错误页面保存 ''' self.logger.info("保存错页内容") try: filename = str(uuid.uuid1()) + ".html" sampleDir = os.path.join(os.path.dirname(__file__), "errorHtml").replace("\\", "/") os.path.exists(sampleDir) or os.mkdir(sampleDir) with open("%s/%s" % (sampleDir, filename), 'w') as f: f.write(content) self.logger.debug("已保存错页内容到{0}".format(filename)) except Exception: self.status = PROGRAM_ERROR s = traceback.format_exc() self.logger.info("保存错页出错") self.logger.warn("{0}".format(s)) def _sampleRecord(self, filename, content): ''' 保存网页内容 ''' self.logger.info("保存网页内容") try: sampleDir = os.path.join(os.path.dirname(__file__), "sample/").replace("\\", "/") os.path.exists(sampleDir) or os.mkdir(sampleDir) with open("%s/%s" % (sampleDir, filename), 'w') as f: f.write(content) self.logger.debug("已保存网页内容到{0}".format(sampleDir)) except Exception: self.status = PROGRAM_ERROR s = traceback.format_exc() self.logger.info("保存网页出错") self.logger.warn("{0}".format(s)) def _fetchUrl(self, url, data=None, header=None, timeout=TIMEOUT, fileName=None, proxy=None): ''' 抓取方法 ''' self.logger.info("开始抓取 {0}".format(url)) if header: headers = header self.logger.debug("伪装头:{0}".format(headers)) else: headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0" } self.logger.debug("伪装头:{0}".format(headers)) for ir in range(REQUEST_RETRY): try: self.logger.debug("第{0}次 抓取".format(ir)) if data: if proxy: content = self.session.post(url, data=data, headers=headers, timeout=timeout, allow_redirects=False, proxies=proxy) self.logger.debug( "POST url:{0}, data:{1}, proxy: {2}".format( url, data, proxy)) else: content = self.session.post(url, data=data, headers=headers, timeout=timeout, allow_redirects=False) self.logger.debug("POST url:{0}, data:{1}".format( url, data)) else: if proxy: content = self.session.get(url, headers=headers, timeout=timeout, allow_redirects=False, proxies=proxy) self.logger.debug("Get url:{0}, proxy: {1}".format( url, proxy)) else: content = self.session.get(url, headers=headers, timeout=timeout, allow_redirects=False) self.logger.debug("Get url:{0}".format(url)) if fileName and SAMPLEFLAG: self._sampleRecord(fileName, content.content) return content except: self.logger.error(traceback.format_exc()) self.logger.error("request url {0} failed ,check 
pls".format(url)) self.status = CRAWL_TIMEOUT raise Exception("Failed to load url (%s)" % url) def _save_captcha(self): """ 下载验证码,返回图片b64编码, """ self.logger.info("刷新验证码") try: codeContent = self.session.get(self.codeUrl, headers=IMGHEADERS).content self.logger.debug("验证码二进制内容:{0}".format(codeContent)[:50]) self.logger.info("下载验证码") self.status = NEED_BCODE with open( os.path.join(os.path.dirname(__file__), "captcha.png").replace("\\", "/"), 'wb') as f: f.write(codeContent) self.logger.info("验证码图片已保存!") bcode = base64.b64encode(codeContent) # self.logger.debug("{}".format(bcode)) return bcode except: s = traceback.format_exc() self.logger.error("刷新验证码错误:%s" % s) return PROGRAM_ERROR, {"error": "超时或代码异常"} def _captcha_recognize(self, imgpath): ''' 自动识别验证码 :param fileName: :return: ''' img = Image.open(imgpath) for i in range(10): code = image_to_string(img, lang='eng').encode('utf-8') if code.isalnum() and len(code) == 4: self.logger.info(code) return code self._save_captcha() time.sleep(0.05) def _ChioceIdent(self, flag): ''' 选择识别方式 :param flag: :return: ''' if flag == 'dmt': self._save_captcha() self.startTime = str(datetime.now()) dmt = damatuWeb.DamatuApi("huoyan2016", "123456") # self.imageCode = dmt.decodeUrl(self.captchaId_url, 200) self.imageCode = dmt.decode( os.path.join(os.path.dirname(__file__), "captcha.png").replace("\\", "/"), 200) self.finishTime = str(datetime.now()) elif flag == 'input': self._save_captcha() pngPath = os.path.join(os.path.dirname(__file__), "captcha.png").replace("\\", "/") self.logger.info("验证码路径:{0}".format(pngPath)) self.imageCode = raw_input("请输入验证码:") elif flag == 'auto': self.startTime = str(datetime.now()) self._save_captcha() self.logger.info("识别验证码") pngPath = os.path.join(os.path.dirname(__file__), "captcha.png").replace("\\", "/") self.imageCode = self._captcha_recognize(pngPath) self.logger.debug("验证码内容:{0}".format(self.imageCode)) self.finishTime = str(datetime.now()) # 返回给用户 通知redis 返回base64 elif flag == 'user': self.startTime = datetime.now() bcode64 = self._save_captcha() self.redisUtils.setNotify(token=self.token, val="10", decs="需要图片验证码", result="data:image/jpg;base64," + bcode64) # 向session中放入数据 while True: # 等待获取用户输入要的图片验证码值 dict_image_code = self.redisUtils.getNotify( self.token, "bcode") if dict_image_code is not None: self.imageCode = dict_image_code return else: self.finishTime = datetime.now() if abs(self.finishTime.minute - self.startTime.minute) >= 3: break # 爬虫等待用户输入图片验证码超时 self.logger.warn("爬虫等待用户输入图片验证码超时:%s" % self.token) time.sleep(1) else: self.status = NOT_NEED_BCODE self.logger.info(NOT_NEED_BCODE_DESC) def login(self, flag): # self._ChioceIdent(flag) if self.username or self.idcard: content = self._fetchUrl(url=self.LoginUrl, header=LOGINHEADERS, fileName="login.html") return content else: return '' @Time() def crawl(self, flag=""): CurTime = datetime.now().strftime("%Y-%m-%d") PastTime = (datetime.now() - timedelta(days=729)).strftime("%Y-%m-%d") try: # login # for i in range(10): content = self.login(flag) if content: info_re = re.compile(r'\/\*\*\/jQuery[0-9]+_[0-9]+\((.*)\)') info_detail = ifNotEmptyGetIndex( info_re.findall(content.content)) info_dict = eval(info_detail) date_info = ifNotEmptyGetIndex(info_dict.get('data', '')) if not date_info: date_info = {'result': ''} info_result = date_info.get('result', '') for detail in info_result: re_id = re.compile(r'id=([0-9]*)') shixinid = ifNotEmptyGetIndex( re_id.findall(detail.get("loc", ""))) self.GJJInfo.append( craw_dishonest.gaofa( 
unperformPart=detail.get("unperformPart", ""), # 被执行人的未履行部分 shixinid=shixinid, # 失信人ID sexy=detail.get("sexy", ""), # 性别 regDate=detail.get("regDate", ""), # 立案时间 publishDate=detail.get("publishDate", ""), # 发布时间 performedPart=detail.get("performedPart", ""), # 被执行人的履行部分 performance=detail.get("performance", ""), # 被执行人的履行情况 partyTypeName=detail.get("partyTypeName", ""), # 类型号 iname=detail.get("iname", ""), # 被执行人姓名/名称 disruptTypeName=detail.get("disruptTypeName", ""), # 失信被执行人行为具体情形 courtName=detail.get("courtName", ""), # 执行法院 caseCode=detail.get("caseCode", ""), # 案号 cardNum=detail.get("cardNum", ""), # 身份证号码/组织机构代码 businessEntity=detail.get("businessEntity", ""), # 法定代表人或负责人姓名 areaName=detail.get("areaName", ""), # 省份 age=detail.get("age", ""), # 年龄(企业默认为0) duty=detail.get("duty", ""), # 生效法律文书确定的义务 gistId=detail.get("gistId", ""), # 执行依据文号 gistUnit=detail.get("gistUnit", ""), # 做出执行依据单位 )) self.status = CRAWL_SUCCESS self.result.append(self.GJJInfo) else: self.status = CRAWL_SUCCESS self.result.append(self.GJJInfo) except: s = traceback.format_exc() self.logger.error("抓取错误:%s" % s) self.status, self.desc = EXEMPLE_IS_NOT_FULL, EXEMPLE_IS_NOT_FULL_DESC finally: try: if len(self.result) == 1 and self.status == CRAWL_SUCCESS: self.desc = CRAWL_SUCCESS_DESC # print self.result result_json = json.dumps(self.result[0], ensure_ascii=False) # print result_json self.redisUtils.setNotify(type=TYPEVALUE, token=self.token, val="1", decs="抓取成功!", result=result_json) # self.push_data(TYPEVALUE, self.userid, result_json) elif self.status == CRAWL_FAIL: self.desc = CRAWL_FAIL_DESC elif self.status == CRAWL_TIMEOUT: self.desc = CRAWL_TIMEOUT_DESC elif self.status == IDCARD_ERROR: self.desc = IDCARD_ERROR_DESC elif self.status == PASSWORD_ERROR: self.desc = PASSWORD_ERROR_DESC elif self.status == BCODE_ERROR: self.desc = BCODE_ERROR_DESC else: self.desc = PROGRAM_ERROR_DESC except Exception as e: s = traceback.format_exc() self.logger.error(s) finally: try: self.redisUtils.setNotify(type=TYPEVALUE, token=self.token, val=self.status, decs=self.desc) except Exception: s = traceback.format_exc() self.logger.error(s) def zipToStr(self, content): ''' 使用urllib2获取到的内容被压缩,需要进行解压缩 :param content: 需要解压的内容 :return: ''' try: conn = zlib.decompress(content, 16 + zlib.MAX_WBITS) return conn except: self.logger.error('解压缩响应内容出错%s' % traceback.format_exc()) raise Exception("解压缩响应内容出错%s" % traceback.format_exc())
class SpiderMain(craw_taxpayer_qualification): logger = logging.getLogger() def __init__(self, user): self.session = requests.session() self.redisUtils = RedisUtils() self.PROXYADDR = PROXYADDR self.bcode = NOT_NEED_BCODE self.status = CRAWL_READY self.desc = "" self.fpjy = user.get("fpjy", "") self.fpdm = user.get("fpdm", "") self.fphm = user.get("fphm", "") self.kprq = user.get("kprq", "") self.fpje = user.get("fpje", "") self.token = user.get("token", "") self.userid = user.get("userid", "") self.fpdm_area = self.fpdm[0:4] self.fpdm_url = AREA.get(self.fpdm_area, "") self.suiji = str(int(round(time.time() * 1000))) self.codeUrl = self.fpdm_url + '/WebQuery/yzmQuery?callback=jQuery110204713398352365614_' + self.suiji + '&fpdm=' + self.fpdm + '&r=' + str( '%.16f' % (random.random())) + '&v=V1.0.04_001' + '&nowtime=' + str( int(round(time.time() * 1000)) ) + '&publickey=B8EE27C2CFEABABBD1DB92F4D84E4EA3&_=' + str( int(round(time.time() * 1000))) self.result = user.get("result", "") self.GJJInfo = [] self.PerInfo = {} # 加入当次代理 # self.proxy = self._proxy() def _proxy(self): proxy = self.session.get(self.PROXYADDR).content return {"http": "http://" + proxy, "https": "http://" + proxy} def _errhtmlRecord(self, content): ''' 错误页面保存 ''' self.logger.info("保存错页内容") try: filename = str(uuid.uuid1()) + ".html" sampleDir = os.path.join(os.path.dirname(__file__), "errorHtml").replace("\\", "/") os.path.exists(sampleDir) or os.mkdir(sampleDir) with open("%s/%s" % (sampleDir, filename), 'w') as f: f.write(content) self.logger.debug("已保存错页内容到{0}".format(filename)) except Exception: self.status = PROGRAM_ERROR s = traceback.format_exc() self.logger.info("保存错页出错") self.logger.warn("{0}".format(s)) def _sampleRecord(self, filename, content): ''' 保存网页内容 ''' self.logger.info("保存网页内容") try: sampleDir = os.path.join(os.path.dirname(__file__), "sample/").replace("\\", "/") os.path.exists(sampleDir) or os.mkdir(sampleDir) with open("%s/%s" % (sampleDir, filename), 'w') as f: f.write(content) self.logger.debug("已保存网页内容到{0}".format(sampleDir)) except Exception: self.status = PROGRAM_ERROR s = traceback.format_exc() self.logger.info("保存网页出错") self.logger.warn("{0}".format(s)) def _fetchUrl(self, url, data=None, header=None, timeout=TIMEOUT, fileName=None, proxy=None): ''' 抓取方法 ''' self.logger.info("开始抓取 {0}".format(url)) if header: headers = header self.logger.debug("伪装头:{0}".format(headers)) else: headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0" } self.logger.debug("伪装头:{0}".format(headers)) for ir in range(REQUEST_RETRY): try: self.logger.debug("第{0}次 抓取".format(ir)) if data: if proxy: content = self.session.post(url, data=data, headers=headers, timeout=timeout, allow_redirects=False, proxies=proxy) self.logger.debug( "POST url:{0}, data:{1}, proxy: {2}".format( url, data, proxy)) else: content = self.session.post(url, data=data, headers=headers, timeout=timeout, allow_redirects=False) self.logger.debug("POST url:{0}, data:{1}".format( url, data)) else: if proxy: content = self.session.get(url, headers=headers, timeout=timeout, allow_redirects=False, proxies=proxy) self.logger.debug("POST url:{0}, proxy: {1}".format( url, proxy)) else: content = self.session.get(url, data=data, headers=headers, timeout=timeout, allow_redirects=False) self.logger.debug("Get url:{0}".format(url)) if fileName and SAMPLEFLAG: self._sampleRecord(fileName, content.content) info = re.compile(r'jQuery[0-9]+_[0-9]+\((.*)\)') info_list = info.findall(content.content)[0] content_dic = 
eval(info_list) return content_dic except: self.logger.error(traceback.format_exc()) self.logger.error("request url {0} failed ,check pls".format(url)) self.status = CRAWL_TIMEOUT raise Exception("Failed to load url (%s)" % url) def _save_captcha(self): """ 下载验证码,返回图片b64编码, """ self.logger.info("刷新验证码") try: content = self.session.get(self.codeUrl, headers=IMGHEADERS, verify=False) info = re.compile(r'jQuery[0-9]+_[0-9]+\((.*)\)') info_list = ifNotEmptyGetIndex(info.findall(content.content)) dic = eval(info_list) codeContent = dic.get('key1', '') self.logger.debug("验证码二进制内容:{0}".format(codeContent)[:50]) self.logger.info("下载验证码") self.status = NEED_BCODE chose_id = dic.get('key4', '') chose_info = COULOR.get(chose_id, '') codeContent1 = base64.b64decode(codeContent) with open( os.path.join(os.path.dirname(__file__), "captcha.png").replace("\\", "/"), 'wb') as f: f.write(codeContent1) bcode = codeContent self.logger.info("验证码图片已保存!") if chose_info != '请输入验证码内容': im = Image.open( os.path.join(os.path.dirname(__file__), "captcha.png")) box = im.copy() u = Image.new('RGB', (90, 55)) u.paste(box, (0, 0)) key_id = Image.open( os.path.join(os.path.dirname(__file__), chose_info)) key_box = key_id.copy() u.paste(key_box, (0, 35)) u.save(os.path.join(os.path.dirname(__file__), "card.png")) with open(os.path.join(os.path.dirname(__file__), "card.png")) as c: bcode = base64.b64encode(c.read()) self.data = urllib.quote(dic.get('key2', '').encode('utf8')).replace( '%20', '+') self.index = dic.get('key3', '') return bcode except: s = traceback.format_exc() self.logger.error("刷新验证码错误:%s" % s) return PROGRAM_ERROR, {"error": "超时或代码异常"} def _captcha_recognize(self, imgpath): ''' 自动识别验证码 :param fileName: :return: ''' img = Image.open(imgpath) for i in range(10): code = image_to_string(img, lang='eng').encode('utf-8') if code.isalnum() and len(code) == 4: self.logger.info(code) return code self._save_captcha() time.sleep(0.05) def _ChioceIdent(self, flag): ''' 选择识别方式 :param flag: :return: ''' if flag == 'dmt': self._save_captcha() self.startTime = str(datetime.now()) dmt = damatuWeb.DamatuApi("huoyan2016", "123456") self.imageCode = dmt.decode( os.path.join(os.path.dirname(__file__), "captcha.png").replace("\\", "/"), 200) self.finishTime = str(datetime.now()) elif flag == 'input': self._save_captcha() pngPath = os.path.join(os.path.dirname(__file__), "captcha.png").replace("\\", "/") self.logger.info("验证码路径:{0}".format(pngPath)) self.imageCode = raw_input("请输入验证码:") elif flag == 'auto': self.startTime = str(datetime.now()) self._save_captcha() self.logger.info("识别验证码") pngPath = os.path.join(os.path.dirname(__file__), "captcha.png").replace("\\", "/") self.imageCode = self._captcha_recognize(pngPath) self.logger.debug("验证码内容:{0}".format(self.imageCode)) self.finishTime = str(datetime.now()) # 返回给用户 通知redis 返回base64 elif flag == 'user': self.startTime = datetime.now() bcode64 = self._save_captcha() self.redisUtils.setNotify(token=self.token, val="10", decs="需要图片验证码", result="data:image/jpg;base64," + bcode64) # 向session中放入数据 while True: # 等待获取用户输入要的图片验证码值 dict_image_code = self.redisUtils.getNotify( self.token, "bcode") # print dict_image_code if dict_image_code is not None: # print dict_image_code self.redisUtils.DelNotify(self.token, "bcode") self.imageCode = dict_image_code break else: self.finishTime = datetime.now() if abs(self.finishTime.minute - self.startTime.minute) >= 2: break time.sleep(1) else: self.status = NOT_NEED_BCODE self.logger.info(NOT_NEED_BCODE_DESC) def login(self, flag): 
self._ChioceIdent(flag) if self.fpje: # LoginUrl = self.fpdm_url+'/WebQuery/query?callback=jQuery110204713398352365614_'+self.suiji+'&fpdm='+self.fpdm+'&fphm='+self.fphm+'&kprq='+self.kprq+'&fpje='+self.fpje+'&fplx=01&yzm='+self.imageCode+'&yzmSj='+self.data+'&index='+self.index+'&iv=31205b0a9543d0cf808f6a3a19915858'+'&salt=bc1792b6b19a7ceb8f124fc75e658cfe'+'&publickey=89FF3E78F5B40654133317B104D81634&_='+str(int(round(time.time() * 1000))) LoginUrl = self.fpdm_url + '/WebQuery/invQuery?callback=jQuery110204713398352365614_' + self.suiji + '&fpdm=' + self.fpdm + '&fphm=' + self.fphm + '&kprq=' + self.kprq + '&fpje=' + self.fpje + '&fplx=01&yzm=' + self.imageCode + '&yzmSj=' + self.data + '&index=' + self.index + '&iv=31205b0a9543d0cf808f6a3a19915858' + '&salt=bc1792b6b19a7ceb8f124fc75e658cfe' + '&publickey=89FF3E78F5B40654133317B104D81634&_=' + str( int(round(time.time() * 1000))) content = self._fetchUrl(url=LoginUrl, header=IMGHEADERS, fileName="login.html") elif self.fpjy: LoginUrl = self.fpdm_url + '/WebQuery/invQuery?callback=jQuery110204713398352365614_' + self.suiji + '&fpdm=' + self.fpdm + '&fphm=' + self.fphm + '&kprq=' + self.kprq + '&fpje=' + self.fpjy + '&fplx=04&yzm=' + self.imageCode + '&yzmSj=' + self.data + '&index=' + self.index + '&iv=31205b0a9543d0cf808f6a3a19915858' + '&salt=bc1792b6b19a7ceb8f124fc75e658cfe' + '&publickey=89FF3E78F5B40654133317B104D81634&_=' + str( int(round(time.time() * 1000))) content = self._fetchUrl(url=LoginUrl, header=IMGHEADERS, fileName="login.html") else: self.logger.debug("没有您查询的方式") content = {"key1": "009"} return content @Time() def crawl(self, flag=""): CurTime = datetime.now().strftime("%Y-%m-%d") PastTime = (datetime.now() - timedelta(days=729)).strftime("%Y-%m-%d") try: # login for i in range(5): content = self.login(flag) if content['key1'] == "007": self.logger.info("验证码失效") self.status, self.desc = BCODE_ERROR, CARD_OUT_DESC continue elif content['key1'] == "008": self.logger.info("验证码错误") self.status, self.desc = BCODE_ERROR, BCODE_ERROR_DESC continue elif content["key1"] == "002": self.logger.info("当日查询次数已超过5次") self.logger.info(PASSWORD_ERROR_DESC) self.status, self.desc = PASSWORD_ERROR, PASSWORD_ERROR_DESC break elif content['key1'] == "009": self.logger.info("查无此票") self.logger.info(EXEMPLE_IS_NOT_FULL) self.status, self.desc = EXEMPLE_IS_NOT_FULL, EXEMPLE_IS_NOT_FULL_DESC break elif content["key1"] == "001": self.logger.info("登陆成功:%s" % content['key1']) a_json = json.dumps(content, ensure_ascii=False) bbb = json.loads(a_json, encoding="gbk") sales_name = bbb.get( 'key2', '').encode('utf8').split('≡')[6] # 销售方名字【6】 purchaser_taxpayer_id = bbb.get( 'key2', '').encode('utf8').split('≡')[3] # 购买方纳税人识别号【3】 purchaser_bank_account = bbb.get( 'key2', '').encode('utf8').split('≡')[5] # 购买方开户行级账号【5】 sales_taxpayer_id = bbb.get( 'key2', '').encode('utf8').split('≡')[7] # 销售方纳税识别号【7】 sales_add_phone = bbb.get( 'key2', '').encode('utf8').split('≡')[8] # 销售方地址电话【8】 check_number = bbb.get( 'key2', '').encode('utf8').split('≡')[17] # 校验号 sales_bank_account = bbb.get( 'key2', '').encode('utf8').split('≡')[9] # 销售方开户行及账号【9】 purchaser_add_phone = bbb.get( 'key2', '').encode('utf8').split('≡')[4] # 购买方地址电话[4] purchaser_name = bbb.get( 'key2', '').encode('utf8').split('≡')[2] # 购买方名称[2] service_name = bbb.get( 'key3', '').encode('utf8').split('█')[0] # 服务名称 specification = bbb.get( 'key3', '').encode('utf8').split('█')[1] # 规格型号 unit = bbb.get('key3', '').encode('utf8').split('█')[2] # 单位 quantity = bbb.get('key3', 
                    unit_price = bbb.get('key3', '').encode('utf8').split('█')[4]  # unit price
                    amount = bbb.get('key3', '').encode('utf8').split('█')[5]  # amount
                    tax_rate = bbb.get('key3', '').encode('utf8').split('█')[6]  # tax rate
                    tax = bbb.get('key3', '').encode('utf8').split('█')[7]  # tax
                    if self.fpjy:
                        machine_code = bbb.get('key2', '').encode('utf8').split('≡')[15]  # machine code
                    else:
                        machine_code = ''
                    self.PerInfo = craw_taxpayer_qualification.baseinfo(
                        sales_name=sales_name,
                        purchaser_taxpayer_id=purchaser_taxpayer_id,
                        purchaser_bank_account=purchaser_bank_account,
                        sales_taxpayer_id=sales_taxpayer_id,
                        sales_add_phone=sales_add_phone,
                        sales_bank_account=sales_bank_account,
                        purchaser_add_phone=purchaser_add_phone,
                        purchaser_name=purchaser_name,
                        service_name=service_name,
                        specification=specification,
                        unit=unit,
                        quantity=quantity,
                        unit_price=unit_price,
                        amount=amount,
                        tax_rate=tax_rate,
                        tax=tax,
                        invoice_code=self.fpdm,     # invoice code
                        invoice_number=self.fphm,   # invoice number
                        billing_date=self.kprq,     # billing date
                        check_number=check_number,  # check code
                        machine_code=machine_code,  # machine code
                        before_tax=self.fpje,       # pre-tax amount
                        total_tax='%.2f' % (float(amount) + float(tax)),  # total = amount + tax
                    )
                    self.GJJInfo.append(self.PerInfo)
                    self.status = CRAWL_SUCCESS
                    self.result.append(self.GJJInfo)
                    break
                else:
                    self.logger.info("查询失败:%s" % content["key1"])
                    self.logger.info(IDCARD_ERROR_DESC)
                    self.status, self.desc = IDCARD_ERROR, IDCARD_ERROR_DESC
                    break
        except:
            s = traceback.format_exc()
            self.logger.error("抓取错误:%s" % s)
            self.status, self.desc = self.status, PROGRAM_ERROR_DESC
        finally:
            try:
                if len(self.result) == 1 and self.status == CRAWL_SUCCESS:
                    self.desc = CRAWL_SUCCESS_DESC
                    result_json = json.dumps(self.result[0], ensure_ascii=False)
                    self.redisUtils.setNotify(type=TYPEVALUE, token=self.token, val="1", decs="抓取成功!", result=result_json)
                    # self.push_data(TYPEVALUE, self.userid, result_json)
                elif self.status == CRAWL_FAIL:
                    self.desc = CRAWL_FAIL_DESC
                elif self.status == CRAWL_TIMEOUT:
                    self.desc = CRAWL_TIMEOUT_DESC
                elif self.status == IDCARD_ERROR:
                    self.desc = IDCARD_ERROR_DESC
                elif self.status == PASSWORD_ERROR:
                    self.desc = PASSWORD_ERROR_DESC
                elif self.status == BCODE_ERROR:
                    self.desc = BCODE_ERROR_DESC
                else:
                    self.desc = PROGRAM_ERROR_DESC
            except Exception as e:
                s = traceback.format_exc()
                self.logger.error(s)
            finally:
                try:
                    self.redisUtils.setNotify(type=TYPEVALUE, token=self.token, val=self.status, decs=self.desc)
                except Exception:
                    s = traceback.format_exc()
                    self.logger.error(s)

    def zipToStr(self, content):
        '''
        Responses fetched with urllib2 may come back gzip-compressed; decompress them.
        :param content: compressed response body
        :return: decompressed bytes
        '''
        try:
            conn = zlib.decompress(content, 16 + zlib.MAX_WBITS)
            return conn
        except:
            self.logger.error('解压缩响应内容出错%s' % traceback.format_exc())
            raise Exception("解压缩响应内容出错%s" % traceback.format_exc())
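
# ---------------------------------------------------------------------------
# Usage sketch for the invoice spider above (illustrative only, kept commented
# out).  `InvoiceSpider` is a hypothetical name -- substitute the real class
# defined earlier in this module -- and the sample field values are made up.
# `crawl(flag='input')` solves the captcha from the console; `flag='user'`
# would instead push it through redis, as implemented in _ChioceIdent.
# ---------------------------------------------------------------------------
# if __name__ == "__main__":
#     user = {
#         "fpdm": "3301164320",   # invoice code; its first 4 digits pick the area URL from AREA
#         "fphm": "12345678",     # invoice number
#         "kprq": "20170801",     # billing date
#         "fpje": "100.00",       # pre-tax amount
#         "token": "demo-token",
#         "userid": "demo-user",
#         "result": [],
#     }
#     spider = InvoiceSpider(user)      # hypothetical class name
#     spider.crawl(flag="input")
#     print spider.status, spider.desc
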
class SpiderMain(hypc_soufun):
    logger = logging.getLogger()

    def __init__(self, user):
        self.session = requests.session()
        self.redisUtils = RedisUtils()
        self.PROXYADDR = PROXYADDR
        self.bcode = NOT_NEED_BCODE
        self.status = CRAWL_READY
        self.desc = ""
        self.keyword = user.get("keyword", "")
        # self.gjjaccnum = self.username if len(self.username) <= 15 else ""
        # self.pwd = user.get("password", "")
        self.age = FANGAGE.get(user.get('age', ''), '') or FANGAGE.get(user.get('year', ''), '')
        self.token = user.get("token", "")
        self.flower = LOUCENG.get(user.get('flower', ''), '') or LOUCENG.get(user.get('floor', ''), '')
        self.hu_type = HUXING.get(user.get('hu_type', ''), '') or HUXING.get(user.get('housetype', ''), '')
        # self.userid = user.get("userid", "")
        self.startUrl = "http://esf.nb.fang.com/NewSecond/sale_info/searchlist_new2014.aspx"
        self.hostUrl = "http://esf.nb.fang.com/"
        self.result = user.get("result", "")
        self.GJJInfo = []
        # per-run proxy, disabled by default
        # self.proxy = {'http': 'http://143.0.188.8:80', 'https': 'https://143.0.188.8:80'}
        # self.proxy = self._proxy()

    def _proxy(self):
        proxy = self.session.get(self.PROXYADDR).content
        # proxy = self.session.get('http://192.168.30.185:13579/ip').content
        return {"http": "http://" + proxy, "https": "http://" + proxy}

    def ifNotEmptyGetIndex(self, somelist, index=0):
        """Return somelist[index] if the list is not empty, else ''."""
        if somelist:
            return somelist[index]
        else:
            return ''

    def _errhtmlRecord(self, content):
        '''Save the HTML of a failed page for debugging.'''
        self.logger.info("保存错页内容")
        try:
            filename = str(uuid.uuid1()) + ".html"
            sampleDir = os.path.join(os.path.dirname(__file__), "errorHtml").replace("\\", "/")
            os.path.exists(sampleDir) or os.mkdir(sampleDir)
            with open("%s/%s" % (sampleDir, filename), 'w') as f:
                f.write(content)
            self.logger.debug("已保存错页内容到{0}".format(filename))
        except Exception:
            self.status = PROGRAM_ERROR
            s = traceback.format_exc()
            self.logger.info("保存错页出错")
            self.logger.warn("{0}".format(s))

    def _sampleRecord(self, filename, content):
        '''Save the raw page content as a sample.'''
        self.logger.info("保存网页内容")
        try:
            sampleDir = os.path.join(os.path.dirname(__file__), "sample/").replace("\\", "/")
            os.path.exists(sampleDir) or os.mkdir(sampleDir)
            with open("%s/%s" % (sampleDir, filename), 'w') as f:
                f.write(content)
            self.logger.debug("已保存网页内容到{0}".format(sampleDir))
        except Exception:
            self.status = PROGRAM_ERROR
            s = traceback.format_exc()
            self.logger.info("保存网页出错")
            self.logger.warn("{0}".format(s))

    def _fetchUrl(self, url, data=None, header=None, timeout=TIMEOUT, fileName=None, proxy=None):
        '''HTTP fetch with retry: POST when data is given, otherwise GET.'''
        self.logger.info("开始抓取 {0}".format(url))
        if header:
            headers = header
            self.logger.debug("伪装头:{0}".format(headers))
        else:
            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0"}
            self.logger.debug("伪装头:{0}".format(headers))
        for ir in range(REQUEST_RETRY):
            try:
                self.logger.debug("第{0}次 抓取".format(ir))
                if data:
                    if proxy:
                        content = self.session.post(url, data=data, headers=headers, timeout=timeout,
                                                    allow_redirects=False, proxies=proxy)
                        self.logger.debug("POST url:{0}, data:{1}, proxy: {2}".format(url, data, proxy))
                    else:
                        content = self.session.post(url, data=data, headers=headers, timeout=timeout,
                                                    allow_redirects=False)
                        self.logger.debug("POST url:{0}, data:{1}".format(url, data))
                else:
                    if proxy:
                        content = self.session.get(url, headers=headers, timeout=timeout,
                                                   allow_redirects=False, proxies=proxy)
                        self.logger.debug("GET url:{0}, proxy: {1}".format(url, proxy))
                    else:
                        content = self.session.get(url, headers=headers, timeout=timeout,
                                                   allow_redirects=False)
                        self.logger.debug("GET url:{0}".format(url))
                if fileName and SAMPLEFLAG:
                    self._sampleRecord(fileName, content.content)
                return content
            except:
                self.logger.error(traceback.format_exc())
                self.logger.error("request url {0} failed, check pls".format(url))
                self.status = CRAWL_TIMEOUT
                raise Exception("Failed to load url (%s)" % url)

    def _save_captcha(self):
        """Download the captcha image and return it as a base64 string."""
        self.logger.info("刷新验证码")
        try:
            codeContent = self.session.get(self.codeUrl, headers=IMGHEADERS).content
            self.logger.debug("验证码二进制内容:{0}".format(codeContent)[:50])
            self.logger.info("下载验证码")
            self.status = NEED_BCODE
            with open(os.path.join(os.path.dirname(__file__), "captcha.png").replace("\\", "/"), 'wb') as f:
                f.write(codeContent)
            self.logger.info("验证码图片已保存!")
            bcode = base64.b64encode(codeContent)
            # self.logger.debug("{}".format(bcode))
            return bcode
        except:
            s = traceback.format_exc()
            self.logger.error("刷新验证码错误:%s" % s)
            return PROGRAM_ERROR, {"error": "超时或代码异常"}

    def _captcha_recognize(self, imgpath):
        '''
        Recognise the captcha automatically via OCR (image_to_string).
        :param imgpath: path of the captcha image
        :return: the recognised 4-character code, or None after 10 attempts
        '''
        img = Image.open(imgpath)
        for i in range(10):
            code = image_to_string(img, lang='eng').encode('utf-8')
            if code.isalnum() and len(code) == 4:
                self.logger.info(code)
                return code
            self._save_captcha()
            time.sleep(0.05)

    def _ChioceIdent(self, flag):
        '''
        Choose how the captcha is solved: 'dmt', 'input', 'auto' or 'user'.
        :param flag:
        :return:
        '''
        if flag == 'dmt':
            self._save_captcha()
            self.startTime = str(datetime.now())
            dmt = damatuWeb.DamatuApi("huoyan2016", "123456")
            # self.imageCode = dmt.decodeUrl(self.captchaId_url, 200)
            self.imageCode = dmt.decode(os.path.join(os.path.dirname(__file__), "captcha.png").replace("\\", "/"), 200)
            self.finishTime = str(datetime.now())
        elif flag == 'input':
            self._save_captcha()
            pngPath = os.path.join(os.path.dirname(__file__), "captcha.png").replace("\\", "/")
            self.logger.info("验证码路径:{0}".format(pngPath))
            self.imageCode = raw_input("请输入验证码:")
        elif flag == 'auto':
            self.startTime = str(datetime.now())
            self._save_captcha()
            self.logger.info("识别验证码")
            pngPath = os.path.join(os.path.dirname(__file__), "captcha.png").replace("\\", "/")
            self.imageCode = self._captcha_recognize(pngPath)
            self.logger.debug("验证码内容:{0}".format(self.imageCode))
            self.finishTime = str(datetime.now())
        # push the captcha back to the caller through redis as a base64 image
        elif flag == 'user':
            self.startTime = datetime.now()
            bcode64 = self._save_captcha()
            self.redisUtils.setNotify(token=self.token, val="10", decs="需要图片验证码",
                                      result="data:image/jpg;base64," + bcode64)
            # wait for the caller to push the captcha answer back into redis
            while True:
                dict_image_code = self.redisUtils.getNotify(self.token, "bcode")
                if dict_image_code is not None:
                    self.imageCode = dict_image_code
                    return
                else:
                    self.finishTime = datetime.now()
                    if abs(self.finishTime.minute - self.startTime.minute) >= 3:
                        # timed out waiting for the user-supplied captcha
                        self.logger.warn("爬虫等待用户输入图片验证码超时:%s" % self.token)
                        break
                    time.sleep(1)
        else:
            self.status = NOT_NEED_BCODE
            self.logger.info(NOT_NEED_BCODE_DESC)

    def prase_detail(self, detail_content):
        detail_info = etree.HTML(detail_content.text)
        sum_price = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='trl-item_top']/div/i/text()"))  # total price
        first_pay = self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='tab-cont-right']/div/div[@class='trl-item_top']/div[2]/text()"))  # down payment
        # month_pay = self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='tab-cont-right']/div[2]/div[3]/a/div/span/i/text()"))  # monthly payment
        house_type = self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='tab-cont-right']/div[2]/div[1]/div[1]/text()")).strip()  # layout
        construction_area = self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='tab-cont-right']/div[2]/div[2]/div[1]/text()"))  # built-up area
        unit_price = self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='tab-cont-right']/div[2]/div[3]/div[1]/text()"))  # unit price
        orientation = self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='tab-cont-right']/div[3]/div[1]/div[1]/text()"))  # orientation
        floor = self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='tab-cont-right']/div[3]/div[2]/div[1]/text()")) + self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='tab-cont-right']/div[3]/div[2]/div[2]/text()"))  # floor
        decoration = self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='tab-cont-right']/div[3]/div[3]/div[1]/text()"))  # decoration
        district = self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='tab-cont-right']/div[4]/div[1]/div[2]/a[1]/text()"))  # residential complex
        quyu = detail_info.xpath("//div[@id='address']/a")  # district links
        quyu_list = []
        for qu in quyu:
            quyu_list.append(self.ifNotEmptyGetIndex(qu.xpath("text()")).strip())
        area = ','.join(quyu_list)  # district
        contact_person = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@id='agantesfxq_C04_02']/text()"))  # contact person
        economic_company = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='tjcont-list-cline2']/span[2]/text()"))  # agency
        phone = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='tjcont-list-cline3 font16']/span/text()"))  # phone
        # listing attributes scraped with regexes from the raw HTML
        build_age_list = re.compile(r'<span class=\"lab\">建筑年代</span>[\s]*<span class=\"rcont\">(.*)</span>')
        build_age = self.ifNotEmptyGetIndex(build_age_list.findall(str(detail_content.text)))
        elevator_list = re.compile(r'<span class=\"lab\">有无电梯</span>[\s]*<span class=\"rcont\">(.*)</span>')
        elevator = self.ifNotEmptyGetIndex(elevator_list.findall(str(detail_content.text)))
        property_right_list = re.compile(r'<span class="lab">产权性质</span>[\s]*<span class="rcont">(.*)</span>')
        property_right = self.ifNotEmptyGetIndex(property_right_list.findall(str(detail_content.text)))
        category_list = re.compile(r'<span class="lab">住宅类别</span>[\s]*<span class="rcont">(.*)</span>')
        category = self.ifNotEmptyGetIndex(category_list.findall(str(detail_content.text)))
        build_structure_list = re.compile(r'<span class="lab">建筑结构</span>[\s]*<span class="rcont">(.*)</span>')
        build_structure = self.ifNotEmptyGetIndex(build_structure_list.findall(str(detail_content.text)))
        build_category_list = re.compile(r'<span class="lab">建筑类别</span>[\s]*<span class="rcont">(.*)</span>')
        build_category = self.ifNotEmptyGetIndex(build_category_list.findall(str(detail_content.text)))
        list_time_list = re.compile(r'<span class="lab">挂牌时间</span>[\s]*<span class="rcont">[\s]*(.*)[\s]*.*</span>')
        list_time = self.ifNotEmptyGetIndex(list_time_list.findall(str(detail_content.text))).strip()
        fang_info = self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='content-item'][2]/div[2]/div/div/div/text()"))  # listing description
        # complex-level statistics
        reference_price = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='cont pt30']/div/div[1]/span[2]/i/text()"))  # reference average price
        district_than_year = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='cont pt30']/div/div[2]/span[2]/em/span/text()"))  # year-on-year change
        district_than_month = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='cont pt30']/div/div[3]/span[2]/em/span/text()"))  # month-on-month change
        district_property_type = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='cont pt30']/div[2]/div[1]/span[2]/text()")).strip()  # property type
        district_property_costs = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='cont pt30']/div[2]/div[2]/span[2]/text()")).strip()  # property management fee
        district_build_type = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='cont pt30']/div[2]/div[3]/span[2]/text()")).strip()  # building type
        district_build_age = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='cont pt30']/div[2]/div[4]/span[2]/text()")).strip()  # construction year
        district_green_rate = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='cont pt30']/div[2]/div[5]/span[2]/text()")).strip()  # greening rate
        district_volume_tate = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='cont pt30']/div[2]/div[6]/span[2]/text()")).strip()  # plot ratio
        district_diversion = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='cont pt30']/div[2]/div[7]/span[2]/text()")).strip()  # pedestrian/vehicle separation
        self.GJJInfo.append(hypc_soufun.baseinfo(
            sum_price=sum_price,
            first_pay=first_pay,
            house_type=house_type,
            construction_area=construction_area,
            unit_price=unit_price,
            orientation=orientation,
            floor=floor,
            decoration=decoration,
            district=district,
            area=area,
            contact_person=contact_person,
            economic_company=economic_company,
            phone=phone,
            build_age=build_age,
            elevator=elevator,
            property_right=property_right,
            category=category,
            build_structure=build_structure,
            build_category=build_category,
            list_time=list_time,
            fang_info=fang_info,
            reference_price=reference_price,
            district_than_year=district_than_year,
            district_than_month=district_than_month,
            district_property_type=district_property_type,
            district_property_costs=district_property_costs,
            district_build_type=district_build_type,
            district_build_age=district_build_age,
            district_green_rate=district_green_rate,
            district_volume_tate=district_volume_tate,
            district_diversion=district_diversion,
        ))

    def login(self, flag):
        # self._ChioceIdent(flag)
        '''Queries are divided into bulk and real-time modes: a keyword search (POST) or a plain listing crawl (GET).'''
        if self.keyword:
            LoginData = {
                'input_keyw1': self.keyword,
                'city': '宁波',
                'district': '',
                'purpose': 'סլ',  # NOTE: this value looks mis-encoded in the source (probably GBK for '住宅'); kept verbatim
                'room': '',
                'pricemin': '',
                'pricemax': '',
                'trackLine': '',
                'keyword': self.keyword,
                'renttype': '',
                'strCity': '宁波',
                'strDistrict': '',
                'Strprice': '',
                'StrNameKeyword': self.keyword,
                'houseType': '',
                'isnewhouse': 0,
                'isFinder': 0,
                'fromdianshang': '',
                'fromhouseprom': '',
                'fromesfchengjiao': ''
            }
            # content = self._fetchUrl(url=self.startUrl, header=LOGINHEADERS, proxy=self.proxy, data=LoginData, fileName="login.html")
            content = self._fetchUrl(url=self.startUrl, header=LOGINHEADERS, data=LoginData, fileName="login.html")
            return content
        else:
            # content = self._fetchUrl(url=self.hostUrl, header=PERHEADERS, proxy=self.proxy, fileName="login.html")
            content = self._fetchUrl(url=self.hostUrl, header=PERHEADERS, fileName="login.html")
            return content

    @Time()
    def crawl(self, flag=""):
        CurTime = datetime.now().strftime("%Y-%m-%d")
        PastTime = (datetime.now() - timedelta(days=729)).strftime("%Y-%m-%d")
        try:
            # login
            # for i in range(10):
            content = self.login(flag)
            if 'keyword=' not in content.text.encode('utf8') and content.text.encode('utf8'):
                self.logger.info("获取信息成功:%s" % 'good info')
                secondUrl = 'http://esf.nb.fang.com/house/' + self.hu_type + self.flower + self.age + 'kw' + '/'
                # content = self._fetchUrl(url=secondUrl, header=PERHEADERS, proxy=self.proxy, fileName="person.html")
                content = self._fetchUrl(url=secondUrl, header=PERHEADERS, fileName="person.html")
                infohtml = etree.HTML(content.text)
                num_info = str(self.ifNotEmptyGetIndex(infohtml.xpath("//div[@class='fanye gray6']/span/text()")).encode('utf8'))
                if num_info:
                    zong = re.compile(r'共(\d*)页')
                    num = zong.search(num_info).group(1)  # extract the page count
                    for i in range(int(num)):
                        # proxy1 = self._proxy()
                        fang_url = 'http://esf.nb.fang.com/house/' + self.hu_type + self.flower + self.age + 'i3' + str(i + 1) + '-' + 'kw' + '/'
                        # list_content = self._fetchUrl(url=fang_url, header=PERHEADERS, proxy=self.proxy, fileName="list.html")
                        list_content = self._fetchUrl(url=fang_url, header=PERHEADERS, fileName="list.html")
                        list_info = etree.HTML(list_content.text)
                        html = list_info.xpath("//div[@class='houseList']/dl")
                        for ht in html[:-1]:
                            a = self.ifNotEmptyGetIndex(ht.xpath("dd[@class='info rel floatr']/p[1]/a/@href"))
                            if a:
                                detail_url = 'http://esf.nb.fang.com' + str(a)
                                # detail page of a single listing
                                detail_content = self._fetchUrl(url=detail_url, header=PERHEADERS, fileName="detail.html")
                                self.prase_detail(detail_content)
                self.status = CRAWL_SUCCESS
                self.result.append(self.GJJInfo)
                # a = json.dumps(self.result[0], ensure_ascii=False)
                # print a
            elif not content.text.encode('utf8'):
                self.logger.info("获取信息成功:%s" % content)
                # data = urllib.quote(self.keyword.decode(sys.stdin.encoding).encode('gb2312'))
                data = urllib.quote(self.keyword.decode('utf-8').encode('gb2312'))
                secondUrl = 'http://esf.nb.fang.com/house/' + self.hu_type + self.flower + self.age + 'kw' + data.lower() + '/'
                # content = self._fetchUrl(url=secondUrl, header=PERHEADERS, proxy=self.proxy, fileName="person.html")
                content = self._fetchUrl(url=secondUrl, header=PERHEADERS, fileName="person.html")
                infohtml = etree.HTML(content.text)
                num_info = str(self.ifNotEmptyGetIndex(infohtml.xpath("//div[@class='fanye gray6']/span/text()")).encode('utf8'))
                if num_info:
                    zong = re.compile(r'共(\d*)页')
                    num = zong.search(num_info).group(1)  # extract the page count
                    for i in range(int(num)):
                        # proxy1 = self._proxy()
                        fang_url = 'http://esf.nb.fang.com/house/' + self.hu_type + self.flower + self.age + 'i3' + str(i + 1) + '-' + 'kw' + data.lower() + '/'
                        # list_content = self._fetchUrl(url=fang_url, header=PERHEADERS, proxy=self.proxy, fileName="list.html")
                        list_content = self._fetchUrl(url=fang_url, header=PERHEADERS, fileName="list.html")
                        list_info = etree.HTML(list_content.text)
                        html = list_info.xpath("//div[@class='houseList']/dl")
                        for ht in html:
                            a = self.ifNotEmptyGetIndex(ht.xpath("dd[@class='info rel floatr']/p[1]/a/@href"))
                            if a:
                                detail_url = 'http://esf.nb.fang.com' + str(a)
                                # detail page of a single listing
                                detail_content = self._fetchUrl(url=detail_url, header=PERHEADERS, fileName="detail.html")
                                self.prase_detail(detail_content)
                    self.status = CRAWL_SUCCESS
                else:
                    self.status = CRAWL_SUCCESS
                    self.GJJInfo.append(hypc_soufun.baseinfo())
                self.result.append(self.GJJInfo)
                # a = json.dumps(self.result[0], ensure_ascii=False)
                # print a
            else:
                self.logger.info("信息失败:%s" % 'bad info')
                self.logger.info(IDCARD_ERROR_DESC)
                self.status, self.desc = IDCARD_ERROR, IDCARD_ERROR_DESC
                self.result.append(self.GJJInfo)
        except:
            s = traceback.format_exc()
            self.logger.error("抓取错误:%s" % s)
            self.status, self.desc = self.status, PROGRAM_ERROR_DESC
        finally:
            try:
                if len(self.result) == 1 and self.status == CRAWL_SUCCESS:
                    self.desc = CRAWL_SUCCESS_DESC
                    result_json = json.dumps(self.result[0], ensure_ascii=False)
                    self.redisUtils.setNotify(type=TYPEVALUE, token=self.token, val="1", decs="抓取成功!", result=result_json)
                    # self.push_data(TYPEVALUE, self.userid, result_json)
                elif self.status == CRAWL_FAIL:
                    self.desc = CRAWL_FAIL_DESC
                elif self.status == CRAWL_TIMEOUT:
                    self.desc = CRAWL_TIMEOUT_DESC
                elif self.status == IDCARD_ERROR:
                    self.desc = IDCARD_ERROR_DESC
                elif self.status == PASSWORD_ERROR:
                    self.desc = PASSWORD_ERROR_DESC
                elif self.status == BCODE_ERROR:
                    self.desc = BCODE_ERROR_DESC
                else:
                    self.desc = PROGRAM_ERROR_DESC
            except Exception as e:
                s = traceback.format_exc()
                self.logger.error(s)
            finally:
                try:
                    self.redisUtils.setNotify(type=TYPEVALUE, token=self.token, val=self.status, decs=self.desc)
                except Exception:
                    s = traceback.format_exc()
                    self.logger.error(s)
    def zipToStr(self, content):
        '''
        Responses fetched with urllib2 may come back gzip-compressed; decompress them.
        :param content: compressed response body
        :return: decompressed bytes
        '''
        try:
            conn = zlib.decompress(content, 16 + zlib.MAX_WBITS)
            return conn
        except:
            self.logger.error('解压缩响应内容出错%s' % traceback.format_exc())
            raise Exception("解压缩响应内容出错%s" % traceback.format_exc())
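
# ---------------------------------------------------------------------------
# Usage sketch for SpiderMain (illustrative only, kept commented out).  The
# field values below are made up; `keyword`, `age`/`year`, `flower`/`floor`
# and `hu_type`/`housetype` are the keys SpiderMain.__init__ reads, while
# `token` and `result` feed the redis notifications sent from crawl().
# ---------------------------------------------------------------------------
# if __name__ == "__main__":
#     user = {
#         "keyword": "某小区",   # community / keyword to search on esf.nb.fang.com (sample value)
#         "year": "2010",        # mapped through FANGAGE
#         "floor": "高层",       # mapped through LOUCENG
#         "housetype": "三居",   # mapped through HUXING
#         "token": "demo-token",
#         "result": [],
#     }
#     spider = SpiderMain(user)
#     spider.crawl()             # this site needs no captcha flag
#     print spider.status, spider.desc
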