def process_exception(self, request, exception, spider):
    """Retry a request whose download raised a retryable exception.

    :param request: the request that failed.
    :param exception: the exception raised while downloading ``request``.
    :param spider: the running spider; ``is_change_proxy`` is set on it for
        every retryable exception that is not a ``TimeoutError``.
    :return: whatever ``self._retry`` returns for retryable exceptions,
        otherwise ``None``.
    """
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed source; confirm that both log calls belong to the
    # non-timeout branch.
    if not isinstance(exception, self.EXCEPTIONS_TO_RETRY):
        return None

    timed_out = isinstance(exception, TimeoutError)
    if not timed_out:
        # Anything other than a timeout flags the spider to switch proxy.
        spider.is_change_proxy = True
        logger.error(u"中间件切换代理ip")
        logger.error(exception)

    return self._retry(request, exception, spider)
def process_response(self, request, response, spider):
    """Handle responses whose status code is in ``self.retry_http_codes``.

    Responses with any other status are returned untouched.  For a
    retryable status the spider is flagged to change proxy, then:

    * spiders other than ``"building"`` get a normal retry;
    * the ``"building"`` spider retries the request unless
      ``self.handle_error_building`` returns a truthy value for the current
      building, in which case the next building is loaded with
      ``self.handle_sql`` and the request is re-issued with the new id.

    :param request: the request that produced ``response``.
    :param response: the downloaded response.
    :param spider: the running spider.
    :return: a request to (re)schedule, or ``response``.
    :raises CloseSpider: when ``self.handle_sql`` yields no further building.
    """
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed source; verify against version control.
    if response.status not in self.retry_http_codes:
        return response

    spider.is_change_proxy = True

    if spider.name != "building":
        # Previously this path fell through and returned None, which Scrapy
        # rejects (process_response must return a Response or a Request).
        reason = response_status_message(response.status)
        return self._retry(request, reason, spider) or response

    # "building" spider: the current record may be data we cannot process.
    building_id = spider.building.get("id")
    logger.error(u"中间件切换代理ip:%s,%s" % (response.status, building_id))
    if not self.handle_error_building(building_id):
        reason = response_status_message(response.status)
        return self._retry(request, reason, spider) or response

    # Move on to the next building and re-issue the request for it.
    spider.building = self.handle_sql(spider.building_sql)
    if not spider.building:
        raise CloseSpider(u"数据收集完成,爬虫关闭")
    return request.replace(
        body=json.dumps({"buildingid": spider.building.get("id")}))
def commit(self, sql, param=None):
    """Execute a write statement, commit it and return the last row id.

    :param sql: SQL statement, optionally with driver placeholders.
    :param param: parameters passed to ``cursor.execute``; when falsy the
        statement is executed without parameters.
    :return: ``cursor.lastrowid`` on success, ``None`` when the statement
        failed (the error is logged, not raised).
    """
    cursor = self.__get_connect__()
    try:
        if param:
            cursor.execute(sql, param)
        else:
            cursor.execute(sql)
        self.__conn__.commit()
        return cursor.lastrowid
    except BaseException as e:
        # Best-effort contract kept from the original: failures are logged
        # and swallowed.  The statement and its parameters are logged
        # separately: interpolating them with ``sql % param`` raised
        # TypeError for ``None`` or list parameters and hid the real error.
        logger.error(e)
        logger.error("sql:%s" % sql)
        logger.error("sql param:%s" % (param,))
    finally:
        # Close the connection even if closing the cursor fails.
        try:
            cursor.close()
        finally:
            self.__conn__.close()
def find(self, sql, param=None, sql_analysis=True):
    """Run a query and return all rows.

    :param sql: SQL statement, optionally with driver placeholders.
    :param param: parameters passed to ``cursor.execute``; when falsy the
        statement is executed without parameters.
    :param sql_analysis: when true, each row is converted to a dict keyed by
        the (stripped) names returned by ``self.__get_sql_query_param__``;
        otherwise the raw ``fetchall()`` result is returned.
    :return: list of dicts, the raw result, or ``None`` when the query
        failed (the error is logged, not raised).
    """
    cursor = self.__get_connect__()
    try:
        if param:
            cursor.execute(sql, param)
        else:
            cursor.execute(sql)
        result = cursor.fetchall()
        if not sql_analysis:
            return result

        # Column names are the same for every row, so strip them once.
        column_names = [
            name.strip() for name in self.__get_sql_query_param__(sql)
        ]
        data_list = list()
        for row in result:
            data_dict = dict()
            for index, name in enumerate(column_names):
                data_dict[name] = row[index]
            data_list.append(data_dict)
        return data_list
    except BaseException as e:
        # Best-effort contract kept from the original: failures are logged
        # and swallowed.  ``param`` is wrapped in a 1-tuple because
        # ``"%s" % param`` raises TypeError for tuple parameters whose
        # length is not exactly one.
        logger.error("sql:%s" % sql)
        logger.error("sql param:%s" % (param,))
        logger.error(e)
    finally:
        # Close the connection even if closing the cursor fails.
        try:
            cursor.close()
        finally:
            self.__conn__.close()
def work(self):
    """Crawl the paged listing for every region, then each building and house.

    For every region from ``get_all_region()`` the listing pages are
    requested one by one, starting at the region's stored ``now_page``.
    Each listing entry is matched to (or inserted as) a real-estate record,
    its buildings are matched or inserted, and every house cell on a
    building page is inserted or has its status refreshed.  Progress is
    saved with ``update_region``.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source; verify the placement of the statistics updates and of the final
    ``update_region`` call against version control.
    """
    delete_logs()
    options = webdriver.ChromeOptions()
    options.add_argument("headless")
    web_driver_manager = WebDriverManager(3, "chrome", options)
    # Driver reused for the per-house detail (captcha) pages; it is replaced
    # whenever processing a house fails (see the inner except below).
    validate_driver = web_driver_manager.get_web_driver()
    for region in get_all_region():
        now_page = region.get("now_page")
        while True:
            real_estate_driver = web_driver_manager.get_web_driver()
            # Fetch one listing page for this region
            url = self.base_url % (region.get("region").encode("utf8"), now_page)
            if not real_estate_driver.send_url(url, "pre"):
                logger.info(
                    region.get("region").encode("utf8") + "房产信息收集完成")
                update_region(region.get("id"), now_page)
                break
            # Advance the page counter as soon as the request has completed
            logger.info(region.get("region") + ":" + str(now_page))
            now_page += 1
            real_estate = real_estate_driver.find_element_by_tag_name(
                "pre").text
            # Close the listing page
            web_driver_manager.destory_web_driver(
                real_estate_driver.get_id())
            if not real_estate:
                # Empty payload: log completion and reset the stored page to 1
                logger.info(
                    region.get("region").encode("utf8") + "房产信息收集完成")
                update_region(region.get("id"), 1)
                break
            # Parse the listing: square brackets are stripped and single
            # quotes turned into double quotes before JSON decoding, so the
            # payload is decoded as one object.
            json_rep = json.loads(
                real_estate.encode("utf8").replace("[", "").replace(
                    "]", "").replace("'", "\""))
            list_json_rep = [json_rep]
            for item in list_json_rep:
                try:
                    # Look up the project's sale figures; skip it when the
                    # total and sold-out counts are non-zero and equal
                    real_estate_name = item.get("ZPROJECT")
                    real_estate_result = get_real_estate_sale_status(
                        real_estate_name=real_estate_name)
                    if real_estate_result and real_estate_result.get("house_total_count") != 0 \
                            and real_estate_result.get("house_sell_out_count") != 0 \
                            and real_estate_result.get("house_total_count") == real_estate_result.get("house_sell_out_count"):
                        continue
                    # Find the real-estate record or insert a new one
                    real_estate = get_real_estate(real_estate_name, region.get("id"))
                    if real_estate:
                        real_estate_id = real_estate.get("id")
                    else:
                        real_estate = RealEstate()
                        real_estate.name = real_estate_name
                        real_estate.region = region.get("id")
                        real_estate.address = item.get("F_ADDR")
                        real_estate.developer = item.get("ENTERPRISENAME")
                        real_estate.sale_building = item.get("F_BLOCK")
                        real_estate.sale_count = item.get("NUM")
                        real_estate.source_id = WebSource.RealEstate
                        real_estate.house_total_count = 0
                        real_estate.house_sell_out_count = 0
                        real_estate_id = real_estate.__add__()
                    # Building data: parallel comma-separated lists
                    build_name = item.get("F_BLOCK").split(",")
                    build_id = item.get("BUILDID").split(",")
                    build_register = item.get("F_REGISTER_DATE").split(",")
                    build_residence_count = item.get("BUILDZZNUM").split(
                        ",")
                    build_none_residence_count = item.get(
                        "BUILDFZZNUM").split(",")
                    # Every building of this project.  The last split element
                    # is not visited -- presumably the source lists end with
                    # a trailing comma; TODO confirm.
                    for index in range(len(build_id) - 1):
                        sale_building = build_name[index].replace("'", "")
                        # Find the building record or insert a new one
                        building = get_building_sale_status(
                            sale_building, real_estate_id)
                        if building:
                            building_id = building.get("id")
                        else:
                            building = Building()
                            building.sale_building = sale_building
                            building.web_build_id = int(build_id[index])
                            building.register_time = datetime.datetime.strptime(
                                build_register[index], "%Y-%m-%d")
                            building.sale_residence_count = int(
                                build_residence_count[index])
                            building.sale_none_residence_count = int(
                                build_none_residence_count[index])
                            building.source_id = WebSource.RealEstate
                            building.real_estate_id = int(real_estate_id)
                            building.total_count = 0
                            building.sale_count = 0
                            # NOTE(review): reads ``.name`` although an
                            # existing real_estate is accessed with ``.get``
                            # above -- confirm both shapes expose ``name``.
                            building.real_estate_name = real_estate.name
                            building_id = building.__add__()
                        # Look up the building's sale figures; skip it when
                        # total_count and sale_count are non-zero and equal
                        building_sale_result = get_building_sale_status(
                            sale_building, real_estate_id)
                        if building_sale_result and building_sale_result.get("total_count") != 0 and building_sale_result.get("sale_count") != 0 \
                                and building_sale_result.get("total_count") == building_sale_result.get("sale_count"):
                            continue
                        # All houses of one building
                        driver_house = web_driver_manager.get_web_driver()
                        houses_url = "http://www.cq315house.com/315web/HtmlPage/ShowRoomsNew.aspx?block=%s&buildingid=%s" %\
                            (sale_building.encode("utf8"), int(build_id[index]))
                        driver_house.send_url(houses_url)
                        house_soup = BeautifulSoup(
                            driver_house.page_source, "html.parser")
                        # Close the building page
                        web_driver_manager.destory_web_driver(
                            driver_house.get_id())
                        # The request counts as failed when the project-info
                        # image is missing
                        if not house_soup.find(
                                "img", attrs={"id": "projectInfo_img"}):
                            continue
                        # Pre-sale certificate: JSON carried in the image
                        # URL after "text="
                        pre_sale_number = json.loads(
                            unquote(
                                house_soup.find(
                                    "img",
                                    attrs={
                                        "id": "projectInfo_img"
                                    }).attrs.get("src").split("text=")
                                [1])).get("presaleCert")
                        # Turn "%uXXXX" escapes into "\uXXXX", decode them,
                        # then re-encode as UTF-8
                        pre_sale_number = pre_sale_number.replace(
                            "%u", "\\u").decode(
                                "raw_unicode_escape").encode("utf-8")
                        update_building(pre_sale_number, building_id)
                        tbody = house_soup.find("table", attrs={
                            "id": "_mybuilding"
                        }).find("tbody")
                        trs = tbody.find_all("tr")
                        # List of units (entrances)
                        unit_td_list = house_soup.find_all(
                            "input", attrs={"name": "unitb"})
                        unit_list = list()
                        for unit_temp in unit_td_list:
                            unit_list.append(unit_temp.next)
                        # Whether any new house was seen for this building
                        is_add_house = False
                        # Built from the first row's column count (minus 2)
                        # and the unit count; consumed by get_unit below
                        house_count_dict = div_list_return_dict(
                            range(len(trs[0].find_all("td")) - 2),
                            len(unit_list))
                        for tr in trs:
                            tds = tr.find_all("td", attrs={"objt": "tdclass"})
                            for td_index, td in enumerate(tds):
                                # Never set to True (the only assignment is
                                # commented out below), so the ``finally``
                                # clause currently does nothing.
                                is_exception = False
                                try:
                                    # Hidden cells are not houses
                                    if "display:none" in td.attrs.get(
                                            "style").replace(" ", ""):
                                        continue
                                    # One individual house
                                    # Unit number
                                    house_unit = get_unit(
                                        house_count_dict, unit_list,
                                        td_index).encode("utf8").decode(
                                            "utf8").replace(" ", "")
                                    # Door number
                                    door_number = td.find(
                                        "font").text.replace(" ", "")
                                    logger.info(
                                        "%s %s %s %s %s %s" %
                                        (datetime.datetime.now(),
                                         region.get("region").encode(
                                             "utf8").decode("utf8"),
                                         real_estate_name, sale_building,
                                         house_unit, door_number))
                                    if not validate_house_door_number(
                                            door_number):
                                        continue
                                    # Sale status as shown on the page
                                    house_status_page = self.get_house_status_page(
                                        td)
                                    if house_status_page <= 0:
                                        # No sale status could be read: skip
                                        # this house
                                        continue
                                    # Look up the house's stored record
                                    house_status = get_house_status(
                                        door_number, real_estate_id,
                                        building_id, house_unit)
                                    if house_status:
                                        # Already stored: refresh the status
                                        # if it changed, fill in a missing
                                        # web_house_id, then skip the
                                        # detail fetch
                                        if int(house_status.get("status")
                                               ) != house_status_page:
                                            update_house_status(
                                                house_status.get("id"),
                                                house_status_page)
                                        if not house_status.get(
                                                "web_house_id"):
                                            update_web_house_id(
                                                td.find("input").attrs.get(
                                                    "value"),
                                                house_status.get("id"))
                                        continue
                                    is_add_house = True
                                    # House not stored yet: build its detail
                                    # URL from the cell's onclick handler
                                    validate_url = "http://www.cq315house.com/315web/" + \
                                        td.find("a").attrs.get("onclick").split("../")[1].split("');")[0]
                                    # Captcha step
                                    # self.get_internet_validate_code(validate_driver, validate_url)
                                    self.get_image(validate_driver, validate_url)
                                    one_house_soup = BeautifulSoup(
                                        validate_driver.page_source,
                                        "html.parser")
                                    if not one_house_soup.find("img"):
                                        raise BaseException(u"无法获取房子数据")
                                    # House details are JSON carried in the
                                    # image URL after "text="
                                    one_house_data = unquote(
                                        one_house_soup.find(
                                            "img", attrs={
                                                "id": "roomInfo_img"
                                            }).attrs.get("src").split(
                                                "text=")[1].replace(
                                                    "%u", "\\u").decode(
                                                        "unicode-escape"))
                                    if not one_house_data:
                                        raise BaseException(u"无法获取房子数据")
                                    if one_house_data and "undefined-undefined" in one_house_data:
                                        raise BaseException(u"无法获取房子数据")
                                    json_data = json.loads(one_house_data)
                                    # Houses whose "HX" value is u"其他" are
                                    # not stored
                                    if json_data.get("HX") == u"其他":
                                        continue
                                    house = House()
                                    house.door_number = door_number
                                    house.status = house_status_page
                                    house.inside_area = json_data.get(
                                        "TNMJ")
                                    house.built_area = json_data.get(
                                        "JZMJ")
                                    house.house_type = json_data.get("HX")
                                    house.inside_price = json_data.get(
                                        "NSDJ_TN")
                                    house.built_price = json_data.get(
                                        "NSDJ_JM")
                                    house.buliding_id = building_id
                                    house.real_estate_id = real_estate_id
                                    house.source_id = 1
                                    house.unit = house_unit
                                    house.web_house_id = td.find(
                                        "input").attrs.get("value")
                                    house.__add__()
                                    logger.info("套内单价:%s, 套内面积:%s" %
                                                (house.inside_price,
                                                 house.inside_area))
                                except BaseException as e1:
                                    # Inner handler: log, then replace the
                                    # detail-page driver and move on to the
                                    # next cell
                                    # is_exception = True
                                    logger.error(u"内层")
                                    web_driver_manager.destory_web_driver(
                                        validate_driver.get_id())
                                    logger.error(e1)
                                    validate_driver = web_driver_manager.get_web_driver(
                                        True)
                                    continue
                                finally:
                                    if is_exception:
                                        update_region(
                                            region.get("id"), now_page)
                        if is_add_house:
                            # Refresh the building's total and on-sale counts
                            building_static_data = get_building_statictics_data(
                                building_id, real_estate_id)
                            update_building_count(
                                building_id,
                                building_static_data.get("total_count"),
                                building_static_data.get("sale_count"))
                    # Refresh the aggregated counts of the real estate
                    static_data = get_real_estate_statictics_data(
                        real_estate_id)
                    update_real_estate_count(
                        real_estate_id,
                        static_data.get("sum(total_count)"),
                        static_data.get("sum(sale_count)"))
                except BaseException as e2:
                    # Outer handler: log and continue with the next entry
                    logger.error(u"外层")
                    logger.error(e2)
                    continue
                finally:
                    # Always save the page reached for this region
                    update_region(region.get("id"), now_page)
            # NOTE(review): the nesting level of this call could not be
            # recovered from the collapsed source; it is assumed to run once
            # per listing page.
            update_region(region.get("id"), now_page)
def get_expression_code(self, ):
    """
    Recognise the captcha expression and compute its answer.

    Four recognisers run in a fixed order and each result is logged; a
    recogniser that raises contributes ``None``.  When none of the first
    three produced a result, the fourth (comparison against saved
    successful images) is used.  Otherwise each of the first three is
    scored with ``self.confirm_return_express`` against the other two and
    one of them is selected by comparing the scores.

    :return: tuple ``(expression, self.compute_code(expression))``;
        ``expression`` is ``0`` when nothing usable was recognised.
    """
    # NOTE(review): block nesting was reconstructed from a
    # whitespace-collapsed source; the per-recogniser log calls are assumed
    # to run whether or not the recogniser failed.
    expression = None
    try:
        recognisers = (
            # plain image recognition
            (u"图片识别:%s",
             lambda: self.get_internet_validate_code()),
            # recognition on the corrected image
            (u"图片识别修正:%s",
             lambda: self.image_corde_correct()),
            # recognition by comparing the split-out image parts
            (u"图片比较识别:%s",
             lambda: self.compare_image_correct(
                 operator_img_url=(self.base_image_path + "operator.png"),
                 number1_img_url=(self.base_image_path + "num1.png"),
                 number2_img_url=(self.base_image_path + "num2.png"))),
            # comparison against previously successful images
            (u"成功图片比较:%s",
             lambda: self.compare_success_img()),
        )
        guesses = []
        for log_template, recognise in recognisers:
            # Bare ``except`` kept on purpose: any failure of a single
            # recogniser just means "no result" from it.
            try:
                guess = recognise()
            except:
                guess = None
            logger.info(log_template % guess)
            guesses.append(guess)
        first, second, third, from_success = guesses

        if first or second or third:
            score_first = self.confirm_return_express(
                first, [second, third])
            score_second = self.confirm_return_express(
                second, [first, third])
            score_third = self.confirm_return_express(
                third, [first, second])
            # NOTE(review): ties fall through to the third result, even if
            # it is None -- confirm that this is intended.
            if score_first > score_second:
                expression = first if score_first > score_third else third
            else:
                expression = second if score_second > score_third else third
        elif from_success:
            expression = from_success
        else:
            logger.info(u"图片识别失败")
    except BaseException as e:
        logger.info(e)
    logger.info(u"验证码:%s" % expression)
    if not expression:
        expression = 0
        logger.error(u"错误")
    # Compute the captcha answer from the chosen expression
    return expression, self.compute_code(expression)
def work(self):
    """Fetch house detail pages behind a captcha and update the stored rows.

    Houses are read one at a time with ``self.base_select_sql`` until the
    query returns nothing.  For each house the detail page is opened, the
    captcha image is cropped out of a screenshot and recognised, and the
    computed answer is submitted.  When the resulting URL contains "bid"
    the house row is updated with ``self.base_update_sql``; whenever the
    building (or real estate) changes between consecutive houses, the
    aggregated counts of the previous one are recomputed.  On any error the
    browser driver is replaced and the loop continues.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source; verify it against version control.  The ``print`` statement
    makes this block Python 2 only.
    """
    delete_logs()
    options = webdriver.ChromeOptions()
    options.add_argument("headless")
    web_driver_manager = WebDriverManager(1, "chrome", options)
    house_driver = web_driver_manager.get_web_driver(True)
    # Building / real estate of the previously processed house; 0 means
    # "none yet" (used for the statistics below)
    buliding_id = 0
    real_estate_id = 0
    while True:
        try:
            house = pool.find_one(self.base_select_sql)
            if not house:
                logger.info(u"数据收集完成")
                break
            # NOTE(review): if base_select_sql can return the same row
            # again, this ``continue`` loops forever -- confirm the query
            # excludes rows without a web_house_id.
            if not house.get("web_house_id"):
                continue
            house_driver.send_url(
                (self.base_house_url % house.get("web_house_id")))
            # Screenshot the whole page
            house_driver.save_screenshot(self.save_image_url)
            # Crop the screenshot to the <img> element and save it back to
            # the same path
            img = house_driver.find_element_by_tag_name("img")
            location_img_url = self.save_image_url
            left = img.location.get("x")
            top = img.location.get("y")
            # Despite their names these are the right and bottom edges of
            # the crop box (offset plus size)
            width = left + img.size.get("width")
            height = top + img.size.get("height")
            image = Image.open(location_img_url).crop(
                (left, top, width, height))
            image.save(location_img_url)
            # Wait in case the image has not been written yet
            time.sleep(3)
            # Recognise the captcha
            image_recognition = ImageRecognition(self.base_image_path,
                                                 self.save_image_url)
            expression, int_code = image_recognition.get_expression_code()
            # Submit the captcha answer
            code_input = house_driver.find_element_by_id("txtCode")
            code_input.send_keys(int_code)
            house_driver.find_element_by_id("Button1").click()
            one_house_url = house_driver.current_url
            # A URL containing "bid" is treated as a successful submission
            if "bid" in one_house_url:
                # Save the successfully recognised image
                image_recognition.save_success_image(
                    self.save_image_url, expression)
                # Collect the data
                one_house_soup = BeautifulSoup(house_driver.page_source,
                                               "html.parser")
                if not one_house_soup.find("img"):
                    raise BaseException(u"无法获取房子数据")
                # House details are JSON carried in the image URL after
                # "text="; "%uXXXX" escapes are turned into "\uXXXX" and
                # decoded
                one_house_data = unquote(
                    one_house_soup.find(
                        "img", attrs={
                            "id": "roomInfo_img"
                        }).attrs.get("src").split("text=")[1].replace(
                            "%u", "\\u").decode("unicode-escape"))
                if not one_house_data:
                    raise BaseException(u"无法获取房子数据")
                if one_house_data and "undefined-undefined" in one_house_data:
                    raise BaseException(u"无法获取房子数据")
                json_data = json.loads(one_house_data)
                # Disabled filter on the house type:
                # if json_data.get("HX") == u"其他":
                # continue
                # Map the "FWZT" text to a status code; 7 when the mapping
                # yields nothing truthy
                house_status = chinese_status.get(json_data.get("FWZT")) if chinese_status.get(json_data.get("FWZT")) else 7
                inside_area = json_data.get("TNMJ")
                built_area = json_data.get("JZMJ")
                house_type = json_data.get("HX")
                inside_price = json_data.get("NSDJ_TN")
                built_price = json_data.get("NSDJ_JM")
                pool.commit(self.base_update_sql, [
                    house_status, inside_area, built_area, house_type,
                    inside_price, built_price,
                    datetime.datetime.now(),
                    house.get("id")
                ])
                logger.info(u"thread:%s, %s:套内单价:%s, 套内面积:%s" %
                            (self.thread_no, house.get("door_number"),
                             inside_price, inside_area))
                # Statistics
                # The building changed: recompute the counts of the
                # previous building
                if buliding_id and buliding_id != house.get("buliding_id"):
                    sql_count_house = """select * from (select count(1) as sale_number from house where buliding_id=%s and status=2) as a, (select count(1) as total_number from house where buliding_id=%s) as b, (select count(1) as sold_number from house where `status` in (3,4,5) and buliding_id=%s) as c"""
                    # Raw row: (sale_number, total_number, sold_number)
                    result_count_house = pool.find_one(
                        sql_count_house,
                        [buliding_id, buliding_id, buliding_id],
                        sql_analysis=False)
                    sql_update_buliding = """update building set sale_residence_count=%s, total_count=%s, sale_count=%s, updated=%s where id=%s"""
                    pool.commit(sql_update_buliding, [
                        result_count_house[0], result_count_house[1],
                        result_count_house[2],
                        datetime.datetime.now(), buliding_id
                    ])
                    buliding_id = house.get("buliding_id")
                    # The real estate changed as well: recompute the counts
                    # of the previous real estate from its buildings
                    if real_estate_id and real_estate_id != house.get(
                            "real_estate_id"):
                        sql_count_buliding = """select sum(sale_residence_count), sum(total_count), sum(sale_count) from building where real_estate_id=%s"""
                        result_count_buliding = pool.find_one(
                            sql_count_buliding, [real_estate_id])
                        sql_update_real_estate = """update real_estate set sale_count=%s, house_total_count=%s, house_sell_out_count=%s, updated=%s where id=%s"""
                        pool.commit(sql_update_real_estate, [
                            result_count_buliding.get(
                                "sum(sale_residence_count)"),
                            result_count_buliding.get("sum(total_count)"),
                            result_count_buliding.get("sum(sale_count)"),
                            datetime.datetime.now(), real_estate_id
                        ])
                        real_estate_id = house.get("real_estate_id")
                # First processed house: remember its building and estate
                if not buliding_id:
                    buliding_id = house.get("buliding_id")
                    real_estate_id = house.get("real_estate_id")
        except BaseException as e:
            # Any failure: log it, dispose of the driver and start a new one
            logger.error(e)
            try:
                web_driver_manager.destory_web_driver(
                    house_driver.get_id())
            except BaseException as e2:
                # The driver could not be destroyed: force-kill the
                # chromedriver processes (Windows ``taskkill`` command)
                print e2
                command = u"taskkill /F /IM chromedriver.exe"
                os.system(command)
            house_driver = web_driver_manager.get_web_driver(True)