def generate_filename_from_url(self, url="", file_type=""): response_html = "" filename = "" filename_base = "" folder = self.list_html_folder_name now = datetime.datetime.now() today = now.strftime('%Y%m%d') url_fragments = url.split("/") while '' in url_fragments: url_fragments.remove('') # Examples: # http://www.dianping.com/chenzhou/ch10/g113 # http://www.dianping.com/shop/72457872 # http://www.dianping.com/shop/8910906/review_all/p624 if "list2" == file_type: if 3 < len(url_fragments): filename_base = f"{url_fragments[-3]}_{url_fragments[-2]}_{url_fragments[-1]}" response_html = f"{filename_base}_{today}.html" filename = response_html elif "detailed" == file_type: folder = self.detail_html_folder_name if 3 < len(url_fragments) and "review_all" == url_fragments[-2]: shop_id = CommonClass.find_digits_from_str(url_fragments[-3]) filename_base = f"shop_{shop_id}_{url_fragments[-1]}" response_html = f"{filename_base}_{today}.html" filename = response_html elif 2 < len(url_fragments): shop_id = CommonClass.find_digits_from_str(url_fragments[-1]) filename_base = f"shop_{shop_id}_p1" response_html = f"{filename_base}_{today}.html" filename = response_html elif "css" == file_type: # http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/a59454e0c1813952099c1e006c298195.css folder = self.svg_text_css_folder_name if 1 < len(url_fragments) and url_fragments[-1].endswith(".css"): filename_base = url_fragments[-1].replace(".css", "") response_html = url_fragments[-1] filename = response_html if response_html is None or 1 > len(response_html): rand_int = random.randint(100000, 999999) response_html = f"unknown{rand_int}_{today}.html" self.logger.error( f"File {response_html} is used to store html page crawled from {url}" ) return response_html, folder, filename, filename_base
def proxy_ip_pool(self):
    """
    Xunlian (迅联) error code 10000: fetching too fast; fetch a new ip at most once every 5 seconds.
    """
    if "DRAGONFLY" == self.proxy_agent:
        return CommonClass.get_proxies(proxy_dict={})
    now = time.time()
    need_new_proxy = False
    if self.proxy_ip_dict is None or 1 > len(self.proxy_ip_dict):
        need_new_proxy = True
    elif "expire" not in self.proxy_ip_dict.keys():
        need_new_proxy = True
    elif now + 3 > self.proxy_ip_dict["expire"]:
        need_new_proxy = True
    if need_new_proxy:
        proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
            headers={}, params_for_proxy_ip={}, setup_xunlian_dict={}, need_setup_xunlian=False, logger=self.logger)
        if 1 > len(proxies_dict):
            return self.proxy_ip_dict  # still return the old ip dict or {}
        proxies_dict["expire"] = now + random.randint(
            self.min_proxy_ip_life_time, self.max_proxy_ip_life_time)  # set ip life time
        self.proxy_ip_dict = proxies_dict
    return self.proxy_ip_dict

def process_request(self, request, spider):
    if self.proxy_meta is None or not isinstance(self.proxy_meta, dict) or 1 > len(self.proxy_meta):
        self.proxy_meta = CommonClass.get_proxies(proxy_dict={})
    if request.url.startswith("http://"):
        request.meta['proxy'] = self.proxy_meta['http']
    elif request.url.startswith("https://"):
        request.meta['proxy'] = self.proxy_meta['https']

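# --- Illustrative note (module path below is a placeholder, not the project's real path) ----------
# process_request() above belongs to a downloader middleware; Scrapy only calls it when the class
# is registered in settings.py, roughly like this:
#
# DOWNLOADER_MIDDLEWARES = {
#     "projectname.middlewares.ProxyDownloaderMiddleware": 543,
# }
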
def extract_detailed_elements(self, response=None, city="", house_id=""):
    text = {}

    # parse fields previously required
    big_box = response.css("div.item.fl")
    real_estate_name = response.css("div.name.fl div.cf h2::text").extract_first(default="")
    real_estate_slogan = big_box.css("div.hd.cf h1.Pagetitle::text").extract_first(default="")
    price_label = big_box.css("div.hd.cf h2.fl.yh.cf em.itemHeader::text").extract_first(default="")
    price_span_list = big_box.css("div.hd.cf h2.fl.yh.cf span.price::text").extract()
    price_span_money = big_box.css("div.hd.cf h2.fl.yh.cf span.price strong::text").extract_first(default="")
    if 2 == len(price_span_list):
        price_str = f"{price_span_list[0]}___price___{price_span_money}___price___{price_span_list[1]}"
    else:
        price_str = "___price___".join(price_span_list)
        price_str = f"{price_str}___price___{price_span_money}"
    detail_lis = big_box.css("ul.itemContent.itemContent3.pr li")
    items = []
    for one_li in detail_lis:
        em_element = one_li.css("em.itemHeader")
        if em_element is not None and 0 < len(em_element):
            item_value_list = one_li.css("::text").extract()
            item_value = ""
            if 1 < len(item_value_list):
                for index, value in enumerate(item_value_list):
                    item_value_list[index] = value.strip()
                item_value += str("".join(item_value_list))
            elif 1 == len(item_value_list):
                item_value = item_value_list[0].strip()
            if "" != item_value:
                item_value = CommonClass.replace_string(
                    string=item_value, char_to_remove=['\r', '\n', '\t', ' ', ], new_char="___break___")
                items.append(item_value)
        else:
            continue
    item_string = ""
    if 0 < len(items):
        item_string = "___descr___".join(items)
    if "" != item_string or "" != real_estate_name or "" != price_label or "" != price_str:
        text["real_estate_name"] = real_estate_name
        text["real_estate_slogan"] = real_estate_slogan
        text["price_label"] = price_label
        text["price_str"] = price_str
        text["item_string"] = item_string
        text["city"] = city
        text["house_id"] = house_id

    # parse fields required on 20190528
    basic_info_box = response.css("div#xxIntr ul.hdl.ft")
    all_lis = basic_info_box.xpath("./li")
    item_list = []
    for one_li in all_lis:
        key = one_li.xpath("./span/text()").extract_first(default="")
        value = one_li.xpath("./p/text()").extract_first(default="")
        if 0 < len(key) and 0 < len(value):
            item_list.append(f"{key}___key2value___{value}")
    if 0 < len(item_list):
        text["basic_info"] = "___basic___".join(item_list)
    return text

def extract_link_list(self, response=None):
    record_list = []
    tr_list = response.xpath('//table[@class="resultTableC"]/tbody/tr')
    for one_tr in tr_list:
        try:
            # one_tr is already a <tr> node, so the link lives under ./td, not ./tr/td
            detailed_page_link = one_tr.xpath('./td/a/@href').extract_first(default="")
            detailed_page_link = CommonClass.clean_string(
                string=detailed_page_link, char_to_remove=['\r', '\n', '\t', ' ', ])
            td_list = one_tr.xpath('./td')
            value_list = []
            for one_td in td_list:
                value_list.append(one_td.xpath("./a/text()").extract_first(default=""))
            # check whether all of these 7 fields are empty strings
            if 7 == len(value_list):
                not_empty = False
                for one_value in value_list:
                    if isinstance(one_value, str) and 0 < len(one_value):
                        not_empty = True
                        break
            if 7 == len(value_list) and not_empty:
                this_record = {
                    "序号": value_list[0],
                    "项目名称": value_list[1],
                    "开发商": value_list[2],
                    "预售证": value_list[3],
                    "项目地址": value_list[4],
                    "住宅已售套数": value_list[5],
                    "住宅未售套数": value_list[6],
                    "详情链接": detailed_page_link,
                }
                record_list.append(this_record)
            elif 7 != len(value_list):
                error_msg = f"value_list ({value_list}) has length other than 7"
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}")
        except Exception as ex:
            error_msg = f"xpath error! Exception = {ex}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}")
    if 1 > len(record_list):
        error_msg = f"Fail to extract links from {response.url}"
        self.logger.error(
            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}")
    return record_list

def get_page_from_url(self, url=""): page_num = 0 url_obj = parse.urlparse(url) if hasattr(url_obj, "path"): url_list = url_obj.path.split("/") for one in url_list: if 0 == one.find("pn"): page_num = CommonClass.find_digits_from_str( string=one, return_all=False) return int(page_num)
def start_requests(self):
    self.init_self_attributes()
    self.make_dirs()
    if "READ_HTML" == self.run_purpose:  # READ_HTML is one kind of debug
        url = 'http://quotes.toscrape.com/page/1/'
        yield scrapy.Request(url=url, callback=self.read_and_parse)
    elif "PRODUCTION_RUN" == self.run_purpose:
        urls = [
            # "http://www.cnemc.cn/sssj/",  # China National Environmental Monitoring Centre, real-time data page
            self.base_url,
        ]
        meta_dict = {}
        if self.use_proxy:
            proxies_dict = self.proxy_ip_pool()
            if 1 > len(proxies_dict):
                sys.exit(3)
            meta_dict["proxy"] = proxies_dict["http"]
        formdata_dict = {}  # no form field needs to be posted to the target site
        for url in urls:
            # yield scrapy.RequestForm( url = url, callback = self.parse_json, meta = meta_dict, dont_filter = True )
            # yield scrapy.Request( url = url, callback = self.parse_list_page, meta = meta_dict, dont_filter = True )
            self.last_request_time = time.time()
            yield scrapy.FormRequest(url=url, formdata=formdata_dict, callback=self.parse_json, meta=meta_dict, dont_filter=True)
    elif "CHECK_PROXY_IP" == self.run_purpose:
        now = int(time.time())
        token = f"Guangzhou{str(now)}"
        m = hashlib.md5()
        m.update(token.encode(encoding='utf-8'))
        urls = [
            f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}",
        ]
        if "DRAGONFLY" == self.proxy_agent:
            proxies_dict = CommonClass.get_proxies(proxy_dict={})
        else:
            proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                headers={}, params_for_proxy_ip={}, setup_xunlian_dict={}, need_setup_xunlian=False, logger=self.logger)
        if 0 < len(proxies_dict):
            meta_dict = {"proxy": proxies_dict["http"]}
            for url in urls:
                yield scrapy.Request(url=url, callback=self.do_nothing_for_debug, meta=meta_dict)
        else:
            self.logger.error(f"Error! No proxy ip returns. {proxies_dict}")
    else:
        urls = [
            "http://quotes.toscrape.com/page/1/",
            "http://quotes.toscrape.com/page/2/",
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.do_nothing_for_debug)

def replace_one_node_text(self, node=None, this_node_class_name20190505=""):
    if node is None:
        return ""
    this_node_class_name = node.xpath("./@class").extract_first(default="")

    # the following block is for the updated anticrawl methods on 20190505
    this_node_get_text = node.get()
    if this_node_get_text is not None and 0 < len(this_node_get_text):
        this_node_get_text5 = this_node_get_text.encode('unicode_escape').decode('utf-8')
        if (6 == len(this_node_get_text5) and '\\' == this_node_get_text5[0]
                and 'u' == this_node_get_text5[1]
                and -1 < this_node_class_name20190505.find("shopNum")):
            key = this_node_get_text5[2:]
            if key in self.database_anticrawl20190505_table.keys():
                # self.logger.warning( f"{this_node_get_text5} ==> {key}; found in {self.database_anticrawl20190505_table[ key ]}" )
                return self.database_anticrawl20190505_table[key]

    # has no class as shopNum: ¥ ==> \uffe5
    not_in_class_mapping_dict = False
    for index, key in enumerate(self.class_mapping_dict):
        this_dict = self.class_mapping_dict[key]
        key_length = this_dict['key_length']
        all_keys = this_dict['all_keys']
        if key_length < len(this_node_class_name) and this_node_class_name[:key_length] in all_keys:
            value = (this_dict['class_mapping'][this_node_class_name]
                     if this_node_class_name in this_dict['class_mapping'].keys() else "")
            if 0 < len(value):
                return value
            else:
                not_in_class_mapping_dict = True
                self.logger.error(f"cannot find {this_node_class_name} in saved mapping class {key}.")
    if not_in_class_mapping_dict:
        return ""
    else:
        temp = CommonClass.clean_string(string=node.get(), char_to_remove=['\r', '\n', '\t', ' ', ])
        return temp

def find_more_house_ids(self, doc = ""): house_id_list = [] counter = 0 index = 0 while True: index = doc.find("data-hid", index) if -1 == index: break sub_doc = doc[index+10:index+25] house_id_list.append( CommonClass.find_digits_from_str( sub_doc ) ) index += 10 counter += 1 return house_id_list
def get_page_area_district_from_url(self, url_object=None):
    """
    https://fs.58.com/shangpucz/
    https://gz.58.com/shangpu/
    https://gz.58.com/tianhe/shangpucz/pn3/?area=20_50
    https://fs.58.com/foshan/shangpucz/pn2/  # "foshan" here means the outskirts of Foshan, same level as Chancheng, Gaoming, Sanshui, etc.
    https://gz.58.com/shangpucz/pn3/
    https://fs.58.com/shangpu/38143746902823x.shtml
    """
    page = "1"
    district = ""
    shop_area = ""
    detailed_page = False
    if url_object is not None and hasattr(url_object, "netloc") and -1 < url_object.netloc.find("58.com"):
        # parse query
        has_shop_area = True
        if not hasattr(url_object, "query") or 1 > len(url_object.query):
            has_shop_area = False
        if has_shop_area:
            query_dict = parse.parse_qs(url_object.query)
            if "area" in query_dict.keys() and isinstance(query_dict["area"], list) and 0 < len(query_dict["area"]):
                shop_area = query_dict["area"][0]
        # parse path
        if hasattr(url_object, "path"):
            url_list = url_object.path.split("/")
            temp_list = []
            for one in url_list:
                if 0 < len(one) and -1 == one.find("shangpucz") and -1 == one.find("shangpu") and -1 == one.find("pn"):
                    temp_list.append(one)
                elif -1 < one.find("pn"):
                    page = CommonClass.find_digits_from_str(string=one, return_all=False)
                elif -1 < one.find(".shtml"):
                    detailed_page = True
            if not detailed_page and 1 == len(temp_list):
                district = temp_list[0]
    if detailed_page:
        page = "0"
    return (page, district, shop_area)

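# --- Illustrative note (expected values are assumptions derived from the logic above, assuming
# CommonClass.find_digits_from_str returns the digit substring) --------------------------------
# For the list-page examples in the docstring, get_page_area_district_from_url() should roughly return:
#   https://gz.58.com/tianhe/shangpucz/pn3/?area=20_50  -> page 3, district "tianhe", shop_area "20_50"
#   https://fs.58.com/foshan/shangpucz/pn2/             -> page 2, district "foshan", shop_area ""
#   https://gz.58.com/shangpucz/pn3/                    -> page 3, district "",       shop_area ""
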
def read_json_and_parse(self, response):
    file_list = os.listdir(self.json_dir)
    # example file name: route0___0___20190615_234522.json
    for one_file in file_list:
        temp_list = one_file.split("___")
        preset_route = 0
        now = ""
        if 2 < len(temp_list):
            preset_route = temp_list[0]
            preset_route = preset_route.lstrip("route")
            preset_route = CommonClass.find_digits_from_str(string=preset_route, return_all=False)
            preset_route = int(preset_route)
            now = temp_list[2]
            now = now.rstrip(".json")
        url = self.get_url_according_to_preset_route(preset_route=preset_route)
        json_file_path = os.path.join(self.json_dir, one_file)
        if os.path.isfile(json_file_path):
            try:
                doc = None
                with open(json_file_path, "rb") as f:
                    doc = f.read().decode("utf-8", "ignore")
                if doc is None:
                    self.logger.error(f"Error: cannot read html file {json_file_path}.")
                    continue
                text_dict = self.extract_text_dict_from_response_body(body=doc, preset_route=preset_route, now=now)
                if 0 < len(text_dict):
                    json_selector = Selector(text=doc, type=None)
                    loader = ItemLoader(item=DirectionbaiduItem(), selector=json_selector)
                    loader = self.load_items_into_loader(loader=loader, text=text_dict, url=url, now=now)
                    yield loader.load_item()
            except Exception as ex:
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, "
                    f"error happened during loading ItemLoader. Exception = {ex}")

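# --- Illustrative sketch (not used by the spider) -------------------------------------------------
# read_json_and_parse() above recovers the preset route number and the timestamp from file names
# shaped like "route0___0___20190615_234522.json"; the same split in isolation:
def _sketch_split_json_file_name(one_file="route0___0___20190615_234522.json"):
    temp_list = one_file.split("___")
    preset_route = int("".join(ch for ch in temp_list[0] if ch.isdigit()))  # -> 0
    now = temp_list[2].replace(".json", "")                                 # -> "20190615_234522"
    return preset_route, now
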
def init_self_attributes(self):
    self.run_purpose = self.settings.get(name='RUN_PURPOSE', default=None)

    # set all paths
    self.root_path = self.settings.get('PROJECT_PATH')
    self.crawled_folder_name = self.settings.get(name='CRAWLED_DIR', default='crawled')
    self.detail_html_folder_name = self.settings.get(name='SAVED_DETAIL_HTML', default='detail_html')
    self.list_html_folder_name = self.settings.get(name='SAVED_LIST_HTML', default='list_html')
    self.svg_text_css_folder_name = self.settings.get(name='SVG_TEXT_CSS', default='svgtextcss')
    if self.run_purpose in ["PARSE_FIDDLER", "PARSE_DETAILED_HOTEL", ]:
        self.detail_html_folder_name = f"{self.detail_html_folder_name}_fiddler"
        self.list_html_folder_name = f"{self.list_html_folder_name}_fiddler"
        self.svg_text_css_folder_name = f"{self.svg_text_css_folder_name}_fiddler"

    # whether this run is for debugging
    self.debug = self.settings.get(name='PROJECT_DEBUG', default=False)
    self.move_fiddler_file = self.settings.get(name='MOVE_FIDDLER_HTML_FILE', default=True)

    # get proxy header
    temp = CommonClass.get_proxies(proxy_dict={})
    self.proxy_meta = temp['http']

    self.database_city_district_table = self.settings.get(name='DATABASE_CITY_DISTRICT_TABLE', default={})
    self.database_level2name_table = self.settings.get(name='DATABASE_LEVEL2NAME_TABLE', default={})
    self.database_merchant_star_level_table = self.settings.get(name='DATABASE_MERCHANT_STAR_LEVEL_TABLE', default={})
    self.database_anticrawl20190505_table = self.settings.get(name='DATABASE_ANTICRAWL20190505_TABLE', default={})
    self.database_common_channel_list_table = self.settings.get(name='DATABASE_COMMON_CHANNEL_LIST_TABLE', default=[])

def extract_all_detailed_html_links(self, string=""):
    house_id_list = []
    if 1 > len(string):
        return house_id_list
    # the argument may be passed in as bytes (e.g. response.body); only decode in that case
    doc = string.decode('utf-8') if isinstance(string, bytes) else string
    end_string = '";var search_result_list_num ='
    end_pos = len(doc)
    if -1 < doc.find(end_string):
        end_pos = doc.find(end_string)
    doc = doc[len('var search_result = " '):end_pos]
    doc = '<!DOCTYPE html><html><head lang="zh-cn"><title>腾讯房产列表</title></head><body>' + f"{doc}</body></html>"
    response = Selector(text=doc, type="html")
    house_id_list = response.xpath("//div/@data-hid").extract()
    if 10 > len(house_id_list):
        house_id_list = self.find_more_house_ids(doc=doc)
    else:
        temp_list = []
        for one_id in house_id_list:
            temp_list.append(CommonClass.find_digits_from_str(one_id))
        house_id_list = temp_list
    return house_id_list

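# --- Illustrative sketch (not used by the spider) -------------------------------------------------
# extract_all_detailed_html_links() above strips a JavaScript assignment, wraps the remaining markup
# in a minimal html skeleton and reads div/@data-hid with a Selector. The same round trip in
# isolation, with made-up payload data:
def _sketch_wrap_js_payload_and_extract():
    from scrapy.selector import Selector
    payload = '<div data-hid="111"></div><div data-hid="222"></div>'
    doc = f'<!DOCTYPE html><html><head></head><body>{payload}</body></html>'
    return Selector(text=doc, type="html").xpath("//div/@data-hid").extract()  # ['111', '222']
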
def __init__(self, root_path="", css_file="", css_string = "", send_requests=False, referer=None, save_requested_svg=True, csv_file=None, settings = None, folder="", logger=None): # read fiddler self.settings = None if settings is None else settings temp = self.settings.get( name = "RUN_PURPOSE", default=None ) self.read_fiddler = False if "PARSE_FIDDLER" == temp: self.read_fiddler = True self.root_path = os.getcwd() if root_path is None or 1 > len( root_path) else root_path self.folder = "list_html" if folder is None or 1 > len( folder ) else folder self.spider_name = self.settings.get( "SPIDER_NAME" ) if self.settings is not None else "" self.svg_css_folder_name = self.settings.get( "SVG_TEXT_CSS" ) if self.settings is not None else "" if self.read_fiddler: self.svg_css_folder_name = f"{ self.svg_css_folder_name }_fiddler" self.css_file = "" if css_file is None or 1 > len( css_file) else css_file if self.css_file is not None and 0 < len( self.css_file ): self.css_file_path = os.path.join( self.root_path, self.spider_name, self.svg_css_folder_name, self.css_file ) self.css_string = "" if css_string is None or 1 > len( css_string ) else css_string self.send_requests = False if send_requests is None else send_requests self.referer = None if referer is None or 1 > len( referer ) else referer self.save_requested_svg = True if save_requested_svg is None else save_requested_svg self.csv_file = "" if csv_file is None or 1 > len( csv_file ) else csv_file self.logger = None if logger is None else logger if self.logger is None: print( f"please pass the logger!" ) sys.exit(2) self.use_proxy = True if self.settings.get( "HTTPPROXY_ENABLED" ) else False proxy_dict = CommonClass.get_proxies( proxy_dict = {} ) self.proxies = proxy_dict['http'] self.svg_files = {} self.svg_urls = {} self.svg_file_dict = {} self.svg_file_contents = {} self.payload = {} self.class_mapping = {} self.class_mapping_updated = False self.key_length = 0
class Bus8684Spider(scrapy.Spider):
    """
    sys.exit code == 1  # missing BUS8684_CITY_LIST
    """
    name = "bus8684"

    root_path = ""
    log_dir = ""
    debug = False
    bus8684_city_list = []
    save_every_response = False
    crawled_dir = ""
    saved_html_dir = ""
    gaode_json_dir = ""
    output_folder_name = ""
    base_uri = ""
    run_purpose = None
    custom_settings = CommonClass.get_custom_settings_dict(spider=name)

    def init_self_attributes(self):
        self.root_path = self.settings.get("PROJECT_PATH")
        self.log_dir = self.settings.get(name="LOG_DIR", default="")
        self.debug = self.settings.get(name="PROJECT_DEBUG", default=False)
        self.bus8684_city_list = self.settings.get("BUS8684_CITY_LIST", default=[])
        if 1 > len(self.bus8684_city_list):
            self.logger.error(f"missing BUS8684_CITY_LIST ({self.bus8684_city_list}) setting")
            sys.exit(1)
        self.save_every_response = self.settings.get(name="SAVE_EVERY_RESPONSE", default=False)
        self.crawled_dir = self.settings.get(name="CRAWLED_DIR", default="")
        self.saved_html_dir = self.settings.get(name="SAVED_HTML", default="")
        self.gaode_json_dir = self.settings.get(name="SAVED_GAODE_JASON", default="")
        self.output_folder_name = self.settings.get(name="OUTPUT_FOLDER_NAME", default="")
        self.base_uri = self.settings.get(name="BASE_URI", default="")
        self.run_purpose = self.settings.get(name="RUN_PURPOSE", default=None)
        self.maximal_requests_of_one_crontab_process = self.settings.get(
            name="MAXIMAL_REQUESTS_OF_ONE_CRONTAB_PROCESS", default=23)
        self.interval_between_requests = self.settings.get(name="INTERVAL_BETWEEN_REQUESTS", default=300)

    def check_dirs_and_files(self):
        if not os.path.isdir(self.crawled_dir):
            os.makedirs(self.crawled_dir)
        if not os.path.isdir(self.saved_html_dir):
            os.makedirs(self.saved_html_dir)
        if not os.path.isdir(self.gaode_json_dir):
            os.makedirs(self.gaode_json_dir)

    def start_requests(self):
        """
        0 == index_level and "index" == page_type: https://guangzhou.8684.cn/
        1 == index_level: https://guangzhou.8684.cn/list1  # list1 page displays links of Bus Route #1, #10, #175 and so on
        "detailed" == page_type: https://guangzhou.8684.cn/x_8234e473  # this one is Bus Route 10 detailed page
        """
        self.init_self_attributes()
        self.check_dirs_and_files()
        if "PRODUCTION_RUN" == self.run_purpose:
            number_day_of_this_year = datetime.datetime.now().timetuple().tm_yday  # type == int
            seperate_into_days = self.settings.get("CRAWL_BATCHES", default=3)
            if seperate_into_days > len(self.bus8684_city_list):
                seperate_into_days = len(self.bus8684_city_list)
            batch_count = math.ceil(len(self.bus8684_city_list) / seperate_into_days)
            today_batch = number_day_of_this_year % seperate_into_days
            start_index = today_batch * batch_count - 1
            end_index = (today_batch + 1) * batch_count
            urls = []
            for index, city in enumerate(self.bus8684_city_list):
                if (start_index < index) and (index < end_index):
                    urls.append(f"https://{city}.{self.base_uri}")
            meta_dict = {
                "page_type": "index",
                "index_level": 0,
            }
            for url in urls:
                yield scrapy.Request(url=url, callback=self.parse_list_page, meta=meta_dict, dont_filter=True)
        elif "READ_HTML" == self.run_purpose:
            url = "http://quotes.toscrape.com/page/1/"
            yield scrapy.Request(url=url, callback=self.debug_one_method, meta={}, dont_filter=True)

    def extract_links_from_list_page(self, response=None, city="", index_level_int=0, route_str=""):
        urls = []
        if index_level_int not in [0, 1, ]:
            return urls
        if 0 == index_level_int:
            digit_href_list = response.xpath("//div[@class='bus_kt_r1']/a/@href").extract()
            letter_href_list = response.xpath("//div[@class='bus_kt_r2']/a/@href").extract()
            all_link_list = digit_href_list + letter_href_list
            for one_link in all_link_list:
                # https://guangzhou.8684.cn/list1; one_link == "/list1", "/listB"
                urls.append(f"https://{city}.{self.base_uri}{one_link}")
            return urls

        # 1 == index_level_int
        route_href_list = response.xpath("//div[@id='con_site_1']/a/@href").extract()
        route_text_list = response.xpath("//div[@id='con_site_1']/a/text()").extract()
        if len(route_href_list) != len(route_text_list):
            error_msg = (f"length of route_href_list ({len(route_href_list)}) != "
                         f"length of route_text_list ({len(route_text_list)})")
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}")
            route_text_list = []
        for index, one_link in enumerate(route_href_list):
            temp_dict = {
                # https://guangzhou.8684.cn/x_face82cc; one_link == "/x_f2148667", "/x_a72d3ade"
                "url": f"https://{city}.{self.base_uri}{one_link}",
                "route": route_text_list[index] if index < len(route_text_list) else 0,
            }
            urls.append(temp_dict)
        return urls

    def parse_list_page(self, response=None):
        write_result_int, city, page_type, index_level, route_str = self.save_html(response=response)
        if -1 == write_result_int or "index" != page_type or index_level not in [0, 1, ]:
            return False
        urls = self.extract_links_from_list_page(
            response=response, city=city, index_level_int=index_level, route_str=route_str)
        if 0 == index_level:
            meta_dict = {
                "page_type": "index",
                "index_level": 1,
            }
            for one_url in urls:
                yield scrapy.Request(url=one_url, callback=self.parse_list_page, meta=meta_dict, dont_filter=True)
        else:
            meta_dict = {
                "page_type": "detailed",
                "index_level": -1,
            }
            for one_url_dict in urls:
                meta_dict["route"] = one_url_dict["route"] if "route" in one_url_dict.keys() else "unknown_route"
                one_url = one_url_dict["url"] if "url" in one_url_dict.keys() else ""
                if 0 < len(one_url):
                    yield scrapy.Request(url=one_url, callback=self.parse_detailed_page, meta=meta_dict, dont_filter=True)
                else:
                    error_msg = f"wrong one_url_dict ({one_url_dict})"
                    self.logger.error(
                        f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}")

    def get_city_from_url(self, url=""):
        city = ""
        result_obj = parse.urlparse(url)
        if -1 < result_obj.netloc.find(self.base_uri):
            temp2_list = result_obj.netloc.split(".")
            if 3 == len(temp2_list):
                city = temp2_list[0]
        return city

    def make_html_file_name(self, response=None, city="", index_level=-1):
        """
        https://guangzhou.8684.cn/
        https://guangzhou.8684.cn/list1, https://guangzhou.8684.cn/listH
        https://guangzhou.8684.cn/x_8234e473, https://guangzhou.8684.cn/x_1ed58fbc
        response already has url and meta attributes
        """
        now = datetime.datetime.now()
        html_filename_str = now.strftime("%Y%m%d_%H%M%S")
        today = now.strftime("%Y%m%d")
        url = response.url
        meta_dict = response.meta
        result_obj = parse.urlparse(url)
        url_path_list = result_obj.path.split("/")
        while "" in url_path_list:
            url_path_list.remove("")
        detailed_page_bool = False
        route_str = ""
        if 0 == len(url_path_list) and 0 == index_level:
            html_filename_str = f"{city}___index{index_level}___route_all___{today}.html"
        elif 1 == len(url_path_list):
            if -1 < url_path_list[0].find("list"):
                route_str = url_path_list[0].lstrip("list")
                if 1 > len(route_str):
                    route_str = "unknown"
                html_filename_str = f"{city}___index{index_level}___route_{route_str}___{today}.html"
            elif -1 < url_path_list[0].find("x_"):
                detailed_page_bool = True
                route_str = str(meta_dict["route"]) if "route" in meta_dict.keys() else "unknown"
                # route_str has Chinese and Entrobus32 does not accept file names including Chinese
                route_id = url_path_list[0]
                html_filename_str = f"{city}___detailed___route_{route_id}___{today}.html"
        else:
            html_filename_str = f"{city}___unknown___route_unknown___{html_filename_str}.html"
        return (detailed_page_bool, route_str, html_filename_str)

    def save_html(self, response=None):
        """
        returns
            -1: wrong response object
            -2: fail to write response.body
            1001: this is a detailed page
            101: more than 69 pages
            0 to 70: page number; 0: detailed page or fail to extract total page from list page
        """
        if response is None or not hasattr(response, "meta") or not hasattr(response, "body") or not hasattr(response, "url"):
            if hasattr(response, "url"):
                error_msg = f"fail to save response.body after requesting {response.url}; response has no body or meta attribute(s)"
            else:
                error_msg = "fail to save response.body; response has no url attribute and may have no body and / or meta attribute(s)"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}")
            return (-1, "", "", -1, "")
        url = response.url
        meta_dict = response.meta
        page_type = "index"
        index_level = -1
        route_str = ""
        city = self.get_city_from_url(url=url)
        html_file_path = ""
        if "page_type" in meta_dict.keys():
            page_type = meta_dict["page_type"]
        if "index_level" in meta_dict.keys():
            index_level = meta_dict["index_level"]
            index_level = CommonClass.safely_convert_to_int(
                to_int_obj=index_level, spider_obj=self, convert_strategy="match_all_digits")
            if index_level is None:
                index_level = -1
        elif "index" == page_type:
            error_msg = f"index_level is NOT in meta_dict.keys(); and page has NOT been saved after requesting {url}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}")
        if "index" == page_type and -1 < index_level:
            detailed_page_bool, route_str, html_filename_str = self.make_html_file_name(
                response=response, city=city, index_level=index_level)
            html_file_path = os.path.join(self.saved_html_dir, html_filename_str)
        elif "detailed" == page_type:
            detailed_page_bool, route_str, html_filename_str = self.make_html_file_name(
                response=response, city=city, index_level=index_level)
            html_file_path = os.path.join(self.saved_html_dir, html_filename_str)
        try:
            with open(html_file_path, "wb") as f:
                f.write(response.body)
        except Exception as ex:
            error_msg = f"fail to write response.body into {html_file_path} after requesting {url}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}")
            return (0, city, page_type, index_level, route_str)
        return (1, city, page_type, index_level, route_str)

    def parse_one_bus_route_fields(self, response=None, city_str="", route_str=""):
        if response is None:
            return {}
        try:
            url = response.url
            url_obj = parse.urlparse(url)
            bus_route_id = url_obj.path.strip("/")
            bus_line_div = response.xpath("//div[@id='bus_line']")
            bus_line_information_div = bus_line_div.xpath(
                "./div[@class='bus_line_information ']/div[@class='bus_i_content']")
            bus_route_title = bus_line_information_div.xpath(
                "./div[@class='bus_i_t1']/h1/text()").extract_first(default="")
            bus_route_title = CommonClass.clean_string(
                string=bus_route_title, char_to_remove=[' ', ' ', '\xa0', ' ', ])
            bus_route_district = bus_line_information_div.xpath(
                "./div[@class='bus_i_t1']/a[@class='bus_i_t2']/text()").extract_first(default="")
            bus_route_info_list = bus_line_information_div.xpath("./p[@class='bus_i_t4']/text()").extract()
            bus_route_info_str = ""
            if 0 < len(bus_route_info_list):
                bus_route_info_str = "___".join(bus_route_info_list)
            bus_operation_interval_str = bus_line_div.xpath(
                "./div[@class='bus_label ']/p[@class='bus_label_t2']/text()").extract_first(default="")

            bus_direction_dict = {}
            all_way_div_list = bus_line_div.xpath("./div[@class='bus_line_top ']")
            for index, one_way_div in enumerate(all_way_div_list):
                one_way_name_text_list = one_way_div.xpath("./div/strong/text()").extract()
                one_way_name = "___".join(one_way_name_text_list) if 0 < len(one_way_name_text_list) else ""
                span_text_list = one_way_div.xpath("./span/text()").extract()
                one_way_stop_number = "___".join(span_text_list) if 0 < len(span_text_list) else ""
                if 0 < len(one_way_stop_number):
                    one_way_stop_number = CommonClass.clean_string(
                        string=one_way_stop_number, char_to_remove=[' ', ' ', '\xa0', ])
                temp_dict = {
                    "one_way_name": one_way_name,
                    "one_way_stop_number": one_way_stop_number,
                }
                bus_direction_dict[index] = temp_dict

            bus_route_stop_round_trip_list = bus_line_div.xpath("./div[@class='bus_line_site ']")
            for index, one_direction in enumerate(bus_route_stop_round_trip_list):
                stop_sequence_list = one_direction.xpath("./div[@class='bus_site_layer']/div/i/text()").extract()
                stop_name_list = one_direction.xpath("./div[@class='bus_site_layer']/div/a/text()").extract()
                if len(stop_name_list) == len(stop_sequence_list):
                    temp_list = []
                    for stop_name_index, stop_name in enumerate(stop_name_list):
                        temp_list.append(f"{stop_sequence_list[stop_name_index]}___{stop_name}")
                    if index in bus_direction_dict.keys():
                        bus_direction_dict[index]["stops"] = temp_list
                    else:
                        bus_direction_dict[index] = {"stops": temp_list}

            text_dict = {
                "route_title": bus_route_title.strip(),
                "city": city_str,
                "route_name": route_str,
                "route_id": bus_route_id.strip(),
                "route_uri": url,
                "route_district": bus_route_district.strip(),
                "route_info": bus_route_info_str.strip(),
                "operation_interval": bus_operation_interval_str.strip(),
                "bus_directions": bus_direction_dict,
            }
            return text_dict
        except Exception as ex:
            error_msg = f"Error happened during parsing. Exception = {ex}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}")
            return {}

    def debug_one_method(self, response):
        file_name = "guangzhou___detailed___route_405路(2019年7月13日起调整)___20190622.html"
        file_name = "guangzhou___detailed___route_花6路___20190622.html"
        html_dir = os.path.join(self.root_path, self.name, self.output_folder_name, "20190622html")
        file_path = os.path.join(html_dir, file_name)
        if os.path.isfile(file_path):
            doc = None
            try:
                with open(file_path, "rb") as html_file:
                    doc = html_file.read().decode("utf-8", "ignore")
            except Exception as ex:
                self.logger.error(f"Error: cannot read html file {file_path}. Exception = {ex}")
                return False
            if doc is None:
                self.logger.error(f"Error: cannot read html file {file_path}.")
                return False
            url = "https://guangzhou.8684.cn/x_f2148667"
            response_for_items = TextResponse(url=url, status=200, body=bytes(doc, encoding="utf-8"))
            write_result_int, city, page_type, index_level, route_str = self.save_html(response=response_for_items)
            text_dict = self.parse_one_bus_route_fields(
                response=response_for_items, city_str=city, route_str=route_str)
            text_dict["city"] = "guangzhou"
            text_dict["route_name"] = "花6路"
            if 0 < len(text_dict):
                try:
                    loader = ItemLoader(item=Bus8684Item(), response=response_for_items)
                    loader = self.load_items_into_loader(loader=loader, text=text_dict, url=url)
                    yield loader.load_item()
                except Exception as ex:
                    self.logger.error(
                        f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, "
                        f"fail to load item. Exception = {ex}")

    def parse_detailed_page(self, response):
        write_result_int, city, page_type, index_level, route_str = self.save_html(response=response)
        text_dict = self.parse_one_bus_route_fields(response=response, city_str=city, route_str=route_str)
        # self.logger.debug( f"write_result_int, city, page_type, index_level, route_str = {(write_result_int, city, page_type, index_level, route_str)}" )
        if 0 < len(text_dict):
            self.logger.info(f"After requesting {response.url}, good response is received.")
            try:
                loader = ItemLoader(item=Bus8684Item(), response=response)
                loader = self.load_items_into_loader(loader=loader, text=text_dict, url=response.url)
                yield loader.load_item()
            except Exception as ex:
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, "
                    f"fail to load item. Exception = {ex}")

    def load_items_into_loader(self, loader=None, text={}, url=""):
        loader.add_value("url", url)
        loader.add_value("project", self.settings.get("BOT_NAME"))
        loader.add_value("spider", self.name)
        loader.add_value("server", socket.gethostname())
        loader.add_value("date", datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
        loader.add_value("content", str(text))
        loader.add_value("page_type", "detailed")
        return loader

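# --- Illustrative sketch (not part of the spiders) ------------------------------------------------
# Both PRODUCTION_RUN start_requests() implementations in this file pick "today's" batch of cities
# with day-of-year modulo CRAWL_BATCHES; the arithmetic in isolation, with made-up city names:
def _sketch_pick_todays_batch(city_list=None, batches=3):
    import datetime
    import math
    city_list = city_list or ["guangzhou", "shenzhen", "foshan", "dongguan", "zhuhai", "zhongshan"]
    if batches > len(city_list):
        batches = len(city_list)
    batch_count = math.ceil(len(city_list) / batches)
    today_batch = datetime.datetime.now().timetuple().tm_yday % batches
    start_index = today_batch * batch_count - 1  # same boundary handling as the spiders
    end_index = (today_batch + 1) * batch_count
    return [city for index, city in enumerate(city_list) if start_index < index < end_index]
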
def parse_detailed_response_field(self, response=None, city="", apt_id=""):
    text = {}
    if response is None:
        return text
    if "READ_HTML" == self.run_purpose and not isinstance(response, Selector):
        return text
    title = response.xpath("//div[@id='lpname']/h1/text()").extract_first(default="")
    if 1 > len(title):
        title = response.xpath(
            "//div[@class='tab-cont clearfix']/div[@class='title rel']/h1[@class='title floatl']/text()"
        ).extract_first(default="")
    title_right_box = response.xpath("//div[@class='tab-cont-right']")
    price_div = title_right_box.xpath(
        "./div[@class='tr-line clearfix zf_new_title']/div[@class='trl-item_top']/div[@class='rel floatl']/preceding-sibling::div")
    price_list = price_div.xpath("string(.)").extract()
    price = "___".join(price_list)

    # extract features
    feature_div = title_right_box.xpath("./div[@class='tr-line clearfix']/div[contains(@class,'trl-item1')]")
    feature_dict = {}
    for one_item in feature_div:
        key = one_item.xpath("./div[@class='font14']/text()").extract_first(default="")
        value = one_item.xpath("./div[@class='tt']/text()").extract_first(default="")
        if 0 < len(key):
            feature_dict[key] = CommonClass.clean_string(string=value, char_to_remove=['\r', '\n', '\t', ' ', ])

    # extract location information
    location_div = title_right_box.xpath("./div[@class='tr-line']/div[@class='trl-item2 clearfix']")
    location_dict = {}
    for one_location in location_div:
        key = one_location.xpath("./div[@class='lab']/text()").extract_first(default="")
        value_list = one_location.xpath("string(./div[@class='rcont'])").extract()
        temp_list = []
        for one_value in value_list:
            temp = CommonClass.clean_string(string=one_value, char_to_remove=['\xa0', '\n', '\t', ' ', ])
            temp_list.append(temp.strip('\r'))  # keep \r
        if 0 < len(key):
            key = CommonClass.clean_string(string=key, char_to_remove=['\u2003', '\xa0', '\n', '\t', ' ', ])
            location_dict[key] = "___".join(temp_list)

    information_box = response.xpath("//div[@class='content-item fydes-item']")
    information_title_list = information_box.xpath("string(./div[@class='title'])").extract()
    information_title = "___".join(information_title_list) if 0 < len(information_title_list) else ""
    information1div = information_box.xpath("./div[@class='cont clearfix']/div[@class='text-item clearfix']")
    information_dict = {}
    for one_item in information1div:
        key = one_item.xpath("./span[@class='lab']/text()").extract_first(default="")
        value_list = one_item.xpath("string(./span[@class='rcont'])").extract()
        temp_list = []
        for one_value in value_list:
            temp = CommonClass.clean_string(string=one_value, char_to_remove=['\xa0', '\n', '\t', ' ', ])
            temp_list.append(temp.strip('\r'))
        if 0 < len(key):
            information_dict[key] = "___".join(temp_list)

    community_box1 = response.xpath("//div[@id='xq_message']")
    community_title = community_box1.xpath("./text()").extract_first(default="")
    community_title = CommonClass.clean_string(string=community_title, char_to_remove=['\xa0', '\n', '\t', ' ', ])
    community_dict = {
        "title": community_title.strip('\r'),
    }
    community_box2 = community_box1.xpath("./following-sibling::div")
    community_box2line1 = community_box2.xpath("./div[@class='topt clearfix']")
    line1_list = community_box2line1.xpath("./div[@class='text-item clearfix']")
    for one_item in line1_list:
        key = one_item.xpath("./span[@class='lab']/text()").extract_first(default="")
        value_list = one_item.xpath("string(./span[@class='rcont'])").extract()
        if 0 < len(key):
            community_dict[key] = "___".join(value_list)
    community_box2line2 = community_box2line1.xpath("./following-sibling::div")
    line2_list = community_box2line2.xpath("./div[@class='text-item clearfix']")
    for one_item in line2_list:
        key = one_item.xpath("./span[@class='lab']/text()").extract_first(default="")
        value = one_item.xpath("./span[@class='rcont ']/text()").extract_first(default="")
        if 0 < len(key):
            key = CommonClass.clean_string(string=key, char_to_remove=['\xa0', '\n', '\t', ' ', ])
            community_dict[key] = CommonClass.clean_string(
                string=value, char_to_remove=['\xa0', '\n', '\t', ' ', '\r', ])
    community_box2line3 = community_box2line2.xpath("./following-sibling::div")
    community_box2line3key = community_box2line3.xpath(
        "./div[@class='text-item']/span[@class='lab']/text()").extract_first(default="")
    community_box2line3value = community_box2line3.xpath(
        "string(./div[@class='text-item']/span[@class='rcont'])").extract()
    temp_list = []
    for one_value in community_box2line3value:
        temp = CommonClass.clean_string(string=one_value, char_to_remove=['\xa0', '\n', '\t', ' ', ])
        temp = temp.strip('\r')
        if 0 < len(temp):
            temp_list.append(temp)
    if 0 < len(community_box2line3key):
        community_dict[community_box2line3key] = "".join(temp_list)

    text = {
        "title": title.strip(),
        "price": price.strip(),
        "feature": feature_dict,
        "location": location_dict,
        "information": information_dict,
        "community": community_dict,
        "city": city,
        "apt_id": apt_id,
    }
    return text

class FangesfSpider(scrapy.Spider):
    """
    Before the distributed scrapyd deployment, this spider is a temporary measure for running
    multiple fangesf processes (fangesfp2 is a copy of this code base).
    sys.exit code == 1  # wrong or missing RUN_PURPOSE
    sys.exit code == 2  # wrong or missing CRAWLED_DIR, SAVED_DETAIL_HTML, or SAVED_GAODE_JASON
    sys.exit code == 3  # fail to get proxy's ip
    On 20190605 Peter writes this spider upon requests
    """
    name = "fangesf"

    root_path = ""
    log_dir = ""
    resume_break_point_detailed_file_name = "crawled_detailed_html.log"
    resume_break_point_list_file_name = "crawled_list_html.log"
    crawled_list_url_list = []
    crawled_detailed_url_list = []
    debug = False
    city_list = []
    district_list = []
    city_name_for_districts = ""
    run_purpose = None
    save_every_response = False
    overwrite_today = ""
    crawled_dir = ""
    saved_html_dir = ""
    gaode_json_dir = ""
    csv_file_path = None
    bedrooms_links = ["g21", "g22", "g23", "g24", "g25", "g299", ]
    over100_filename = ""
    custom_settings = CommonClass.get_custom_settings_dict(spider=name)
    proxy_ip_dict = {}
    min_proxy_ip_life_time = 6
    max_proxy_ip_life_time = 180
    use_proxy = False
    proxy_agent = ""

    def init_self_attributes(self):
        self.root_path = self.settings.get("PROJECT_PATH")
        self.log_dir = self.settings.get(name="LOG_DIR", default="")
        self.debug = self.settings.get(name="PROJECT_DEBUG", default=False)
        self.city_name_for_districts = self.settings.get("CITY_NAME_FOR_DISTRICTS", default="city")
        self.district_list = self.settings.get("DISTRICT_LIST", default=[])
        if 1 > len(self.district_list) and "city" != self.city_name_for_districts:
            self.logger.error(f"missing DISTRICT_LIST ({self.district_list}) setting")
            sys.exit(1)
        self.city_list = self.settings.get("CITY_LIST", default=[])
        if 1 > len(self.city_list) and "city" == self.city_name_for_districts:
            self.logger.error(f"missing CITY_LIST ({self.city_list}) setting")
            sys.exit(1)
        self.run_purpose = self.settings.get(name="RUN_PURPOSE", default=None)
        if self.run_purpose is None:
            self.logger.error(f"missing RUN_PURPOSE ({self.run_purpose}) setting")
            sys.exit(2)
        self.save_every_response = self.settings.get(name="SAVE_EVERY_RESPONSE", default=False)
        self.overwrite_today = self.settings.get("OVERWRITE_TODAY", default="")
        if not hasattr(self, "overwrite_today") or 1 > len(self.overwrite_today) or self.overwrite_today is None:
            self.overwrite_today = datetime.datetime.now().strftime("%Y%m%d")

        # set all paths
        self.crawled_dir = self.settings.get(name="CRAWLED_DIR", default="")
        self.saved_html_dir = self.settings.get(name="SAVED_HTML", default="")
        self.gaode_json_dir = self.settings.get(name="SAVED_GAODE_JASON", default="")
        self.csv_file_path = os.path.join(self.crawled_dir, f"fang_esf{self.overwrite_today}.csv")
        if 1 > len(self.crawled_dir) or 1 > len(self.saved_html_dir) or 1 > len(self.gaode_json_dir):
            error_msg = (f"missing CRAWLED_DIR ({self.crawled_dir}), SAVED_HTML ({self.saved_html_dir}), "
                         f"or SAVED_GAODE_JASON ({self.gaode_json_dir}) setting(s)")
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}")
            sys.exit(3)
        self.over100_filename = self.settings.get(name="OVER100_LOG_FILENAME", default="")
        self.min_proxy_ip_life_time = self.settings.get(name="MIN_PROXY_LIFE_SPAN", default=6)
        self.max_proxy_ip_life_time = self.settings.get(name="MAX_PROXY_LIFE_SPAN", default=180)
        self.use_proxy = self.settings.get(name="HTTPPROXY_ENABLED", default=False)
        self.proxy_agent = self.settings.get(name="PROXY_AGENT", default="")

    def make_dirs(self):
        # even when cache is used, we save all html files; here we make these 3 dirs if they do not exist
        if not os.path.isdir(self.crawled_dir):
            os.makedirs(self.crawled_dir)
        if not os.path.isdir(self.saved_html_dir):
            os.makedirs(self.saved_html_dir)
        if not os.path.isdir(self.gaode_json_dir):
            os.makedirs(self.gaode_json_dir)

    def proxy_ip_pool(self):
        """
        Xunlian (迅联) error code 10000: fetching too fast; fetch a new ip at most once every 5 seconds.
        """
        if "DRAGONFLY" == self.proxy_agent:
            return CommonClass.get_proxies(proxy_dict={})
        now = time.time()
        need_new_proxy = False
        if self.proxy_ip_dict is None or 1 > len(self.proxy_ip_dict):
            need_new_proxy = True
        elif "expire" not in self.proxy_ip_dict.keys():
            need_new_proxy = True
        elif now + 3 > self.proxy_ip_dict["expire"]:
            need_new_proxy = True
        if need_new_proxy:
            proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                headers={}, params_for_proxy_ip={}, setup_xunlian_dict={}, need_setup_xunlian=False, logger=self.logger)
            if 1 > len(proxies_dict):
                return self.proxy_ip_dict  # still return the old ip dict or {}
            proxies_dict["expire"] = now + random.randint(
                self.min_proxy_ip_life_time, self.max_proxy_ip_life_time)  # set ip life time
            self.proxy_ip_dict = proxies_dict
        return self.proxy_ip_dict

    def read_crawled_urls(self):
        resume_break_point_detailed_file_path = os.path.join(self.log_dir, self.resume_break_point_detailed_file_name)
        try:
            with open(resume_break_point_detailed_file_path, "r", encoding="utf-8") as log_file:
                self.crawled_detailed_url_list = log_file.readlines()
                while "" in self.crawled_detailed_url_list:
                    self.crawled_detailed_url_list.remove("")
        except Exception as ex:
            error_msg = f"fail to read {resume_break_point_detailed_file_path}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}")
        # for list pages, do not use this [] to exclude seen urls
        # resume_break_point_list_file_path = os.path.join( self.log_dir, self.resume_break_point_list_file_name )
        # try:
        #     with open( resume_break_point_list_file_path, "r", encoding="utf-8" ) as log_file:
        #         self.crawled_list_url_list = log_file.readlines()
        #         while "" in self.crawled_list_url_list:
        #             self.crawled_list_url_list.remove("")
        # except Exception as ex:
        #     error_msg = f"fail to read {resume_break_point_list_file_path}"
        #     self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" )

    def start_requests(self):
        self.init_self_attributes()
        self.make_dirs()
        self.read_crawled_urls()
        if "READ_HTML" == self.run_purpose:  # READ_HTML is one kind of debug
            url = 'http://quotes.toscrape.com/page/1/'
            yield scrapy.Request(url=url, callback=self.read_and_parse)
        elif "PRODUCTION_RUN" == self.run_purpose:
            if "city" == self.city_name_for_districts:
                city_list = self.city_list
            else:
                city_list = self.district_list
            number_day_of_this_year = datetime.datetime.now().timetuple().tm_yday  # type == int
            seperate_into_days = self.settings.get("CRAWL_BATCHES", default=3)
            if seperate_into_days > len(city_list):
                seperate_into_days = len(city_list)
            batch_count = math.ceil(len(city_list) / seperate_into_days)
            today_batch = number_day_of_this_year % seperate_into_days
            start_index = today_batch * batch_count - 1
            end_index = (today_batch + 1) * batch_count
            urls = []
            for index, city in enumerate(city_list):
                if (start_index < index) and (index < end_index):
                    url = (f"https://{city}.esf.fang.com/" if "city" == self.city_name_for_districts
                           else f"https://{self.city_name_for_districts}.esf.fang.com/house-{city}/")
                    urls.append(url)
            meta_dict = {
                "page_type": "index",
                "total_pages": 0,
                "index_level": 0,
            }
            if "city" != self.city_name_for_districts:
                meta_dict["index_level"] = 1
            if self.use_proxy:
                proxies_dict = self.proxy_ip_pool()
                if 1 > len(proxies_dict):
                    sys.exit(3)
                meta_dict["proxy"] = proxies_dict["http"]
            for url in urls:
                yield scrapy.Request(url=url, callback=self.parse_list_page, meta=meta_dict, dont_filter=True)
        elif "GET_CHANNELS" == self.run_purpose:  # GET_CHANNELS is one kind of debug
            urls = []
            city_list = self.settings.get("CITY_LIST", default=[])
            for index, city in enumerate(city_list):
                urls.append(f"https://{city}.esf.fang.com/")
            if 0 < len(urls):
                meta_dict = {
                    "page_type": "index",
                    "total_pages": 0,
                    "index_level": 0,
                }
                yield scrapy.Request(url=urls[0], callback=self.parse_list_page, meta=meta_dict, dont_filter=True)
        elif "CHECK_PROXY_IP" == self.run_purpose:
            now = int(time.time())
            token = f"Guangzhou{str(now)}"
            m = hashlib.md5()
            m.update(token.encode(encoding='utf-8'))
            urls = [
                f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}",
            ]
            if "DRAGONFLY" == self.proxy_agent:
                proxies_dict = CommonClass.get_proxies(proxy_dict={})
            else:
                proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                    headers={}, params_for_proxy_ip={}, setup_xunlian_dict={}, need_setup_xunlian=False, logger=self.logger)
            if 0 < len(proxies_dict):
                meta_dict = {"proxy": proxies_dict["http"]}
                for url in urls:
                    yield scrapy.Request(url=url, callback=self.do_nothing_for_debug, meta=meta_dict)
            else:
                self.logger.error(f"Error! No proxy ip returns. {proxies_dict}")
        else:
            urls = [
                "http://quotes.toscrape.com/page/1/",
                "http://quotes.toscrape.com/page/2/",
            ]
            for url in urls:
                yield scrapy.Request(url=url, callback=self.do_nothing_for_debug)

    def get_total_pages(self, response=None):
        """
        if ONE page already includes all records, there is still one element called "共1页"
        if ONE page includes 0 record, there is no element called "共x页"
        """
        total_pages = 0
        if response is None:
            return total_pages
        all_ps = response.xpath("//div[@id='list_D10_15']/p")
        total_pages_p = ""
        for one_p in all_ps:
            total_pages_p = one_p.xpath("./text()").extract_first(default="")
            if 0 < len(total_pages_p) and -1 < total_pages_p.find("共"):
                break
        if -1 < total_pages_p.find("共"):
            search_obj = re.search(r"(\d)+", total_pages_p, re.M | re.I)
            if search_obj is not None:
                start = search_obj.span()[0]
                end = search_obj.span()[1]
                if 0 < len(total_pages_p[start:end]):
                    total_pages = int(total_pages_p[start:end])
        else:
            error_msg = f"cannot find total page at uri {response.url}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}")
        return total_pages

    def get_city_from_url(self, url=""):
        city = ""
        result_obj = parse.urlparse(url)
        if -1 < result_obj.netloc.find("fang.com"):
            temp2_list = result_obj.netloc.split(".")
            if 4 == len(temp2_list):
                city = temp2_list[0]
        return city

    def make_html_file_name(self, url="", city=""):
        now = datetime.datetime.now()
        html_filename = "{}.html".format(now.strftime("%Y%m%d_%H%M%S"))
        today = now.strftime("%Y%m%d")
        result_obj = parse.urlparse(url)
        url_list = result_obj.path.split("/")
        while "" in url_list:
            url_list.remove("")
        detail_page = False
        last_part = url_list[len(url_list) - 1] if 0 < len(url_list) else ""
        if -1 < last_part.find(".htm"):
            detail_page = True
            # /chushou/3_218307566.htm ==> https://sz.esf.fang.com/chushou/3_218307566.htm
            temp = last_part.split("_")
            apt_id = f"{last_part}"
            if 1 < len(temp):
                apt_id = f"{temp[1]}"
            html_filename = f"{city}_{apt_id}_{today}.html"
        elif -1 < result_obj.netloc.find("fang.com") and 1 > len(url_list):
            # list page #1: https://sz.esf.fang.com/
            html_filename = f"{city}_index1_{today}.html"
        else:
            page, district_area, bedrooms = self.get_page_and_district_area(url_list=url_list)
            if 0 < len(district_area):
                html_filename = f"{city}_{district_area}_index{page}_{today}.html"
            else:
                html_filename = f"{city}_index{page}_{today}.html"
        return (detail_page, html_filename)

    def get_page_and_district_area(self, url_list=[]):
        """
        list page #2 or more, or including channels like:
        https://sz.esf.fang.com/house-a013080/  where a013080 stands for 深圳市龙华区 or
        https://sz.esf.fang.com/house-a013080-b014334 or https://sz.esf.fang.com/house-a013080-b02094/i372/
            where b014334 stands for 深圳市龙华区大浪; house-a013080-b02094 stands for 观澜;
            house-a013080-b0350 stands for 龙华; house-a013080-b014333 stands for 民治
        https://sz.esf.fang.com/house-a087-b0342/g22/  where g22 stands for 二居室;
            g21(一居), g23(三居), g24(四居), g25(五居), g299(五居以上)
            # this option is a multiple choice but this crawl will ONLY use single choice
        """
        page = "1"
        district_area = ""
        bedrooms = 0
        for index, key in enumerate(url_list):
            one_fragment = url_list[index]
            if -1 < one_fragment.find("i3") and -1 == one_fragment.find("house-"):
                page = one_fragment[2:]
            elif -1 < one_fragment.find("house-") and -1 == one_fragment.find("i3"):
                district_area = one_fragment.replace("house-", "")
                district_area = district_area.replace("-", "_")
                if index + 1 < len(url_list):
                    next_fragment = url_list[index + 1]
                    if -1 < next_fragment.find("g2"):
                        last_part_of_fragment = next_fragment.replace("g2", "")
                        if -1 < last_part_of_fragment.find("-i3"):
                            temp_list = last_part_of_fragment.split("-i3")
                            if 1 < len(temp_list):
                                bedrooms = int(temp_list[0])
                        else:
                            bedrooms = int(last_part_of_fragment)
        return (page, district_area, bedrooms)

    def save_html(self, response=None, save100=False):
        if response is None or not hasattr(response, "meta") or not hasattr(response, "body") or not hasattr(response, "url"):
            if hasattr(response, "url"):
                error_msg = f"fail to save response.body after requesting {response.url}; response has no body or meta attribute(s)"
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}")
            return -1
        url = response.url
        meta_dict = response.meta
        page_type = "index"
        total_pages = 0
        city = self.get_city_from_url(url=url)
        if "page_type" in meta_dict.keys():
            page_type = meta_dict["page_type"]
        if "index" == page_type:
            if "total_pages" in meta_dict.keys():
                total_pages = int(meta_dict["total_pages"])
            if 0 == total_pages:
                total_pages = self.get_total_pages(response=response)
            if 99 < total_pages and not save100:
                return 101  # https://sz.esf.fang.com/house-a013080/
            detail_page, html_filename = self.make_html_file_name(url=url, city=city)
            html_file_path = os.path.join(self.saved_html_dir, html_filename)
            save_html_file = True
        elif "detailed" == page_type:
            apt_id = self.get_apt_id(url=url)
            today = datetime.datetime.now().strftime("%Y%m%d")
            html_filename = f"{city}___{apt_id}___{today}.html"
            html_file_path = os.path.join(self.saved_html_dir, html_filename)
            save_html_file = True
            total_pages = 1001  # https://sz.esf.fang.com/chushou/3_218307566.htm
        try:
            with open(html_file_path, "wb") as f:
                f.write(response.body)
        except Exception as ex:
            error_msg = f"fail to write response.body into {html_file_path} after requesting {url}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}")
            return -2
        else:
            if 1 > total_pages:
                error_msg = (f"response.body saved after requesting {response.url}; "
                             f"but fail to extract total page number from response.body")
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}")
        return total_pages  # could be 100 when save100 = True

    def extract_link_list(self, response=None):
        link_list = response.xpath(
            '//div[@class="shop_list shop_list_4"]/dl[@class="clearfix"]/dd/h4[@class="clearfix"]/a/@href').extract()
        if 1 > len(link_list):
            error_msg = f"Fail to extract links from {response.url}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}")
        return link_list

    def divide_request_into_next_level(self, response=None):
        if response is None or not hasattr(response, "meta") or not hasattr(response, "body") or not hasattr(response, "url"):
            error_msg = f"meta = {hasattr(response, 'meta')}; body = {hasattr(response, 'body')}; url = {hasattr(response, 'url')}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}")
            return -1
        url = response.url
        result_obj = parse.urlparse(url)
        url_list = result_obj.path.split("/")
        while "" in url_list:
            url_list.remove("")
        meta_dict = response.meta
        index_level = 0
        if "index_level" in meta_dict.keys():
            index_level = int(meta_dict["index_level"])
        page, district_area, bedrooms = self.get_page_and_district_area(url_list=url_list)
        if 0 < bedrooms:
            # as of 20190605, we ONLY care about levels up to bedrooms
            page_status = self.save_html(response=response, save100=True)
            self.write_log(content=f"{response.url}", logfilename=self.over100_filename, content_only=True)
        # district_area has higher priority than index_level
        if 0 < len(district_area):
            temp_list = district_area.split("_")
            if index_level != len(temp_list):
                error_msg = f"index_level {index_level} != {len(temp_list)} ({district_area})"
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}")
                index_level = len(temp_list)
        else:
            if 0 != index_level:
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {index_level} is not 0")
                index_level = 0
        pointer, link_list = self.extract_this_level_screen_options(
            response=response, index_level=index_level, district_area=district_area, bedrooms=0)
        return (pointer, link_list, index_level)

    def make_new_url(self, url="", index_level=0, fragment=""):
        result_obj = parse.urlparse(url)
        url_path_list = result_obj.path.split("/")
        while "" in url_path_list:
            url_path_list.remove("")
        has_bedroom = False
        for one_path in url_path_list:
            if 0 == one_path.find("i3"):
                return ""  # https://sz.esf.fang.com/house-a013080-b02094/i372/
            if 0 == one_path.find("g2"):
                has_bedroom = True
        if 2 == index_level:
            if has_bedroom and 0 == fragment.find("g2"):
                return ""  # ONLY one option is selected
            return_url = f"{result_obj.scheme}://{result_obj.netloc}/{result_obj.path}{fragment}"
            if 0 == result_obj.path.find('/'):
                return_url = f"{result_obj.scheme}://{result_obj.netloc}{result_obj.path}{fragment}"
            # returns the first url: https://sz.esf.fang.com/house-a090-b0352/g23/
            # but for page #2 and above, url shall be: https://sz.esf.fang.com/house-a090-b0352/g23-i37/
            return return_url
        return_url = f"{result_obj.scheme}://{result_obj.netloc}/{fragment}"
        if 0 == fragment.find('/'):
            return_url = f"{result_obj.scheme}://{result_obj.netloc}{fragment}"
        return return_url

    def extract_this_level_screen_options(self, response=None, index_level=0, district_area="", bedrooms=0):
        """
        currently ONLY 1 > pointer will be returned
        """
        link_list = []
        if 1 > index_level:
            link_list = response.xpath(
                '//div[@class="screen_al"]/ul/li[@class="clearfix
screen_list"]/ul[@class="clearfix choose_screen floatl"]/li/a/@href' ).extract() # remove 地铁线路 temp_list = [] for one_link in link_list: if -1 == one_link.find("house1-"): temp_list.append(one_link) link_list = temp_list elif 1 == index_level: link_list = response.xpath( '//div[@class="screen_al"]/ul/li[@class="area_sq"]/ul[@class="clearfix"]/li/a/@href' ).extract() elif 2 == index_level: if 0 < bedrooms: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {bedrooms} is not 0; as of 20190605, we ONLY have 3 levels to divide requests" ) return (-1, []) # this for future ONLY return (0, self.bedrooms_links) else: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {index_level} over 2; as of 20190605, we ONLY have 3 levels to divide requests" ) return (-1, []) temp_list = [] for one_link in link_list: temp_string = one_link.replace("/", "") temp_string = temp_string.replace("house-", "") temp_list.append(temp_string.replace("-", "_")) pointer = 0 # currently ONLY 1 > pointer will be returned return (pointer, link_list) def load_items_into_loader(self, loader=None, text={}, url=""): loader.add_value("content", str(text)) # , encoding="utf-8" loader.add_value("page_type", "detailed") # record housekeeping fields loader.add_value("url", url) loader.add_value("project", self.settings.get('BOT_NAME')) loader.add_value("spider", self.name) loader.add_value("server", socket.gethostname()) loader.add_value("date", datetime.datetime.now().strftime("%Y%m%d_%H%M%S")) return loader def parse_detailed_response_field(self, response=None, city="", apt_id=""): text = {} if response is None: return text if "READ_HTML" == self.run_purpose and not isinstance( response, Selector): return text title = response.xpath("//div[@id='lpname']/h1/text()").extract_first( default="") if 1 > len(title): title = response.xpath( "//div[@class='tab-cont clearfix']/div[@class='title rel']/h1[@class='title floatl']/text()" ).extract_first(default="") title_right_box = response.xpath("//div[@class='tab-cont-right']") price_div = title_right_box.xpath( "./div[@class='tr-line clearfix zf_new_title']/div[@class='trl-item_top']/div[@class='rel floatl']/preceding-sibling::div" ) price_list = price_div.xpath("string(.)").extract() price = "___".join(price_list) # extract features feature_div = title_right_box.xpath( "./div[@class='tr-line clearfix']/div[contains(@class,'trl-item1')]" ) feature_dict = {} for one_item in feature_div: key = one_item.xpath( "./div[@class='font14']/text()").extract_first(default="") value = one_item.xpath("./div[@class='tt']/text()").extract_first( default="") if 0 < len(key): feature_dict[key] = CommonClass.clean_string(string=value, char_to_remove=[ '\r', '\n', '\t', ' ', ]) # extract location information location_div = title_right_box.xpath( "./div[@class='tr-line']/div[@class='trl-item2 clearfix']") location_dict = {} for one_location in location_div: key = one_location.xpath( "./div[@class='lab']/text()").extract_first(default="") value_list = one_location.xpath( "string(./div[@class='rcont'])").extract() temp_list = [] for one_value in value_list: temp = CommonClass.clean_string(string=one_value, char_to_remove=[ '\xa0', '\n', '\t', ' ', ]) temp_list.append(temp.strip('\r')) # keep \r if 0 < len(key): key = CommonClass.clean_string(string=key, char_to_remove=[ '\u2003', '\xa0', '\n', '\t', ' ', ]) location_dict[key] = "___".join(temp_list) information_box = response.xpath( 
"//div[@class='content-item fydes-item']") information_title_list = information_box.xpath( "string(./div[@class='title'])").extract() information_title = "___".join( information_title_list) if 0 < len(information_title_list) else "" information1div = information_box.xpath( "./div[@class='cont clearfix']/div[@class='text-item clearfix']") information_dict = {} for one_item in information1div: key = one_item.xpath("./span[@class='lab']/text()").extract_first( default="") value_list = one_item.xpath( "string(./span[@class='rcont'])").extract() temp_list = [] for one_value in value_list: temp = CommonClass.clean_string(string=one_value, char_to_remove=[ '\xa0', '\n', '\t', ' ', ]) temp_list.append(temp.strip('\r')) if 0 < len(key): information_dict[key] = "___".join(temp_list) community_box1 = response.xpath("//div[@id='xq_message']") community_title = community_box1.xpath("./text()").extract_first( default="") community_title = CommonClass.clean_string(string=community_title, char_to_remove=[ '\xa0', '\n', '\t', ' ', ]) community_dict = { "title": community_title.strip('\r'), } community_box2 = community_box1.xpath("./following-sibling::div") community_box2line1 = community_box2.xpath( "./div[@class='topt clearfix']") line1_list = community_box2line1.xpath( "./div[@class='text-item clearfix']") for one_item in line1_list: key = one_item.xpath("./span[@class='lab']/text()").extract_first( default="") value_list = one_item.xpath( "string(./span[@class='rcont'])").extract() if 0 < len(key): community_dict[key] = "___".join(value_list) community_box2line2 = community_box2line1.xpath( "./following-sibling::div") line2_list = community_box2line2.xpath( "./div[@class='text-item clearfix']") for one_item in line2_list: key = one_item.xpath("./span[@class='lab']/text()").extract_first( default="") value = one_item.xpath( "./span[@class='rcont ']/text()").extract_first(default="") if 0 < len(key): key = CommonClass.clean_string(string=key, char_to_remove=[ '\xa0', '\n', '\t', ' ', ]) community_dict[key] = CommonClass.clean_string(string=value, char_to_remove=[ '\xa0', '\n', '\t', ' ', '\r', ]) community_box2line3 = community_box2line2.xpath( "./following-sibling::div") community_box2line3key = community_box2line3.xpath( "./div[@class='text-item']/span[@class='lab']/text()" ).extract_first(default="") community_box2line3value = community_box2line3.xpath( "string(./div[@class='text-item']/span[@class='rcont'])").extract( ) temp_list = [] for one_value in community_box2line3value: temp = CommonClass.clean_string(string=one_value, char_to_remove=[ '\xa0', '\n', '\t', ' ', ]) temp = temp.strip('\r') if 0 < len(temp): temp_list.append(temp) if 0 < len(community_box2line3key): community_dict[community_box2line3key] = "".join(temp_list) text = { "title": title.strip(), "price": price.strip(), "feature": feature_dict, "location": location_dict, "information": information_dict, "community": community_dict, "city": city, "apt_id": apt_id, } return text def get_apt_id(self, url=""): apt_id = 0 result_obj = parse.urlparse(url) url_list = result_obj.path.split("/") while "" in url_list: url_list.remove("") last_part = url_list[len(url_list) - 1] if -1 < last_part.find(".htm"): temp = last_part.split("_") if 1 < len(temp): temp = f"{temp[1]}" search_obj = re.search(r"(\d)+", temp, re.M | re.I) if search_obj is not None: start = search_obj.span()[0] end = search_obj.span()[1] if 0 < len(temp[start:end]): apt_id = int(temp[start:end]) if 1 > apt_id: return f"random{random.randint(10000,99999)}" return str(apt_id) def 
log_for_picking_up_the_crawl_break_point(self, page_type="detailed", response=None): if "detailed" == page_type: resume_break_point_file_path = os.path.join( self.log_dir, self.resume_break_point_detailed_file_name) else: resume_break_point_file_path = os.path.join( self.log_dir, self.resume_break_point_list_file_name) try: with open(resume_break_point_file_path, "a") as f: f.write(f"{response.url}\n") except Exception as ex: error_msg = f"fail to write response.url into {resume_break_point_file_path}" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) def parse_detailed_page(self, response=None): url = response.url result_obj = parse.urlparse(url) has_url_error = self.url_contains_error( result_obj_path=result_obj.path) if has_url_error: return False page_status = self.save_html(response=response, save100=True) city = self.get_city_from_url(url=url) apt_id = self.get_apt_id(url=url) text = self.parse_detailed_response_field(response=response, city=city, apt_id=apt_id) try: loader = ItemLoader(item=FangesfItem(), response=response) loader = self.load_items_into_loader(loader=loader, text=text, url=url) self.log_for_picking_up_the_crawl_break_point(page_type="detailed", response=response) yield loader.load_item() except Exception as ex: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, fail to load item. Exception = {ex}" ) def do_nothing_for_debug(self, response=None): self.logger.info( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {response.url}" ) # print( response.body ) # Inside Method request_proxy_ip of Class ProxyAgent, proxy server returns [{'IP': '49.87.226.131:10749'}] # b'{"REMOTE_ADDR":"49.87.226.131","HTTP_CLIENT_IP":"","HTTP_X_FORWARDED_FOR":"49.87.226.131, 49.87.226.131"}' # 2019-06-20 16:28:55 [fangesf] INFO: Inside Method do_nothing_for_debug of Class FangesfSpider, # url = https://www.coursehelper.site/index/index/getHeaders?token=ad89558c89c3394167adbfd1484c8700 # 2019-06-20 16:28:55 [stdout] INFO: b'{"REMOTE_ADDR":"139.196.200.61","HTTP_CLIENT_IP":"","HTTP_X_FORWARDED_FOR":"139.196.200.61, 139.196.200.61"}' def url_contains_i3_page(self, result_obj_path=""): if 1 > len(result_obj_path): return False path_fragment_list = result_obj_path.split("/") if 1 > len(path_fragment_list): return False for one in path_fragment_list: if 0 == one.find("i3"): return True elif 0 == one.find("g2") and 0 < one.find("-i3"): # the bedroom url looks like g299-i39 return True return False def url_contains_error(self, result_obj_path=""): if 1 > len(result_obj_path): return False path_fragment_list = result_obj_path.split("/") if 1 > len(path_fragment_list): return False # https://sz.esf.fang.com/staticsearchlist/Error/Error404?aspxerrorpath=/house-a013057/i330/i330 for one in path_fragment_list: if -1 < one.find("Error") or -1 < one.find( "Error404") or -1 < one.find("staticsearchlist"): self.logger.info( f"Error! Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {result_obj_path}" ) return True # http://search.fang.com/captcha-verify/?t=1559927114.963&h=aHR0cHM6Ly9zei5lc2YuZmFuZy5jb20vaG91c2UtYTA5MC1iMDM1NC9nMjU%3D&c=cmE6MTE0LjI1Mi4yMTIuMjEwO3hyaTo7eGZmOg%3D%3D for one in path_fragment_list: if -1 < one.find("captcha") or -1 < one.find("verify"): self.logger.info( f"Need captcha-verify! 
Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {result_obj_path}" ) return True return False def parse_list_page(self, response=None): """ 0 == index_level: https://shaoguan.esf.fang.com/house/i32/ or https://shaoguan.esf.fang.com 1 == index_level: https://gz.esf.fang.com/house-a072/i34/ or https://gz.esf.fang.com/house-a072/ 2 == index_level: https://gz.esf.fang.com/house-a072-b0627/i35/ or https://gz.esf.fang.com/house-a072-b0627/ 3 == index_level: https://sz.esf.fang.com/house-a090-b0352/g23-i37/ or https://sz.esf.fang.com/house-a090-b0352/g23/ """ result_obj = parse.urlparse(response.url) has_url_error = self.url_contains_error( result_obj_path=result_obj.path) if has_url_error: return False page_status = self.save_html(response=response, save100=False) if 1 > page_status: pass # -2, -1, 0: error_msg has been logged; just pass elif 0 < page_status and 101 > page_status and not has_url_error: # 1 to 100 also means "index" == page_type link_list = self.extract_link_list(response=response) if self.debug: self.logger.info( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {response.url}; link_list = {link_list}" ) else: self.log_for_picking_up_the_crawl_break_point( page_type="index", response=response) new_url = f"{result_obj.scheme}://{result_obj.netloc}" # crawling vertically meta_dict = { "page_type": "detailed", "total_pages": 1, } if self.use_proxy: proxies_dict = self.proxy_ip_pool() meta_dict["proxy"] = proxies_dict["http"] for one_link in link_list: if 0 != one_link.find('/'): one_link = f"/{one_link}" this_i_url = f"{new_url}{one_link}" if this_i_url in self.crawled_detailed_url_list: self.logger.info(f"previously crawled {this_i_url}") else: self.logger.info(f"requesting {this_i_url}") yield scrapy.Request(url=this_i_url, callback=self.parse_detailed_page, meta=meta_dict, dont_filter=False) # crawling horizontally if 1 < page_status and not self.url_contains_i3_page( result_obj_path=result_obj.path): meta_dict = response.meta meta_dict["total_pages"] = page_status new_url = f"{new_url}{result_obj.path}" if len(new_url) - 1 != new_url.rfind('/'): new_url = f"{new_url}/" is_bedroom_url = False # https://sz.esf.fang.com/house-a090-b0352/i36/ if "index_level" in meta_dict.keys() and 3 == int( meta_dict["index_level"]): is_bedroom_url = True new_url = new_url.rstrip('/') # https://sz.esf.fang.com/house-a090-b0352/g23-i37/ elif "index_level" in meta_dict.keys() and 0 == int( meta_dict["index_level"]): if 1 > len(result_obj.path): new_url = f"{new_url}house/" elif -1 == result_obj.path.find("house"): new_url = f"{new_url}house/" # this city ONLY has 2 to 99 list pages and there is no need to divide requests into next level # therefore 0 == index_level # https://shaoguan.esf.fang.com/house/i32/ if self.use_proxy: proxies_dict = self.proxy_ip_pool() meta_dict["proxy"] = proxies_dict["http"] for i in range(page_status - 1): this_i_url = f"{new_url}-i3{i + 2}" if is_bedroom_url else f"{new_url}i3{i + 2}" self.logger.info( f"requesting list page at {this_i_url}") yield scrapy.Request(url=f"{this_i_url}", callback=self.parse_list_page, meta=meta_dict, dont_filter=True) elif 101 == page_status and not has_url_error: # 101 also means "index" == page_type self.log_for_picking_up_the_crawl_break_point(page_type="index", response=response) pointer, link_list, index_level = self.divide_request_into_next_level( response=response) # https://sz.esf.fang.com/house-a090-b0352/g23-i37/ if -1 < pointer: # using level3 
bedrooms meta_dict = { "page_type": "index", "total_pages": 0, "index_level": index_level + 1, } if self.use_proxy: proxies_dict = self.proxy_ip_pool() meta_dict["proxy"] = proxies_dict["http"] for i in range(len(link_list) - pointer): new_url = self.make_new_url(url=response.url, index_level=index_level, fragment=link_list[i + pointer]) if 0 < len(new_url): self.logger.info( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, requesting {new_url}; meta_dict = {meta_dict}" ) yield scrapy.Request(url=new_url, callback=self.parse_list_page, meta=meta_dict, dont_filter=True) elif 1001 == page_status and not has_url_error: self.parse_detailed_page(response=response) # 1001 also means "detailed" == page_type # will never reach here because self.parse_detailed_page() is the callback method def read_and_parse(self, response=None): file_list = os.listdir(self.saved_html_dir) for one_file in file_list: if -1 == one_file.find("index"): temp_list = one_file.split("___") apt_id = 0 city = "" if 1 < len(temp_list): apt_id = temp_list[1] city = temp_list[0] url = f"https://{city}.esf.fang.com/chushou/3_{apt_id}.htm" # can also be 16_, 10_, and others # https://sz.esf.fang.com/chushou/3_218307566.htm html_file_path = os.path.join(self.saved_html_dir, one_file) if os.path.isfile(html_file_path): doc = None with open(html_file_path, 'rb') as f: # doc = f.read().decode('gb2312', 'ignore') doc = f.read().decode('utf-8', 'ignore') if doc is None: self.logger.error( f"Error: cannot read html file {html_file_path}.") continue response = Selector(text=doc, type="html") text = self.parse_detailed_response_field( response=response, city=city, apt_id=apt_id) try: response_for_items = TextResponse( url=url, status=200, body=bytes(doc, encoding="utf-8")) loader = ItemLoader(item=FangesfItem(), response=response_for_items) loader = self.load_items_into_loader(loader=loader, text=text, url=url) yield loader.load_item() except Exception as ex: self.logger.info( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, Exception = {ex}" ) if self.debug: break def write_log(self, content=None, logfilename=None, content_only=False): if content is not None and 0 < len(content): today = datetime.datetime.now().strftime("%Y%m%d") if logfilename is None: logfilename = f"{self.name}{today}.log" try: with open(os.path.join(self.log_dir, logfilename), 'a', encoding='utf-8') as f: if content_only: info = f"{str(content)}\n" else: info = f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}] {content}\n" f.write(info) return 1 except Exception as ex: return 0 return -1
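# --- Illustrative sketch (not part of the spider) ---------------------------------------------
# The docstring of get_page_and_district_area above documents how fang.com list-page paths encode
# the page number ("i3..." fragments), the district/area ("house-a013080-b014334") and the bedroom
# filter ("g22", "g23-i37", ...). The standalone helper below is a simplified re-implementation
# for demonstration only; names prefixed with _demo_ are hypothetical and the spider's own method
# remains authoritative.
from urllib import parse as _parse

def _demo_decompose_list_path(url=""):
    fragments = [f for f in _parse.urlparse(url).path.split("/") if f]
    page, district_area, bedrooms = "1", "", 0
    for index, fragment in enumerate(fragments):
        if "house-" not in fragment and "i3" in fragment:
            page = fragment.split("i3")[-1]          # "i35" -> "5", "g23-i37" -> "7"
        elif fragment.startswith("house-"):
            district_area = fragment.replace("house-", "").replace("-", "_")
            if index + 1 < len(fragments) and fragments[index + 1].startswith("g2"):
                rest = fragments[index + 1][2:]      # "g22" -> "2", "g23-i37" -> "3-i37"
                bedrooms = int(rest.split("-i3")[0]) if "-i3" in rest else int(rest)
    return page, district_area, bedrooms

# _demo_decompose_list_path("https://sz.esf.fang.com/house-a013080-b014334/g22-i35/")
# returns ("5", "a013080_b014334", 2) under this simplified parsing.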
def start_requests(self): self.init_self_attributes() self.make_dirs() self.read_crawled_urls() if "READ_HTML" == self.run_purpose: # READ_HTML is one kind of debug url = 'http://quotes.toscrape.com/page/1/' yield scrapy.Request(url=url, callback=self.read_and_parse) elif "PRODUCTION_RUN" == self.run_purpose: if "city" == self.city_name_for_districts: city_list = self.city_list else: city_list = self.district_list number_day_of_this_year = datetime.datetime.now().timetuple( ).tm_yday # type == int seperate_into_days = self.settings.get("CRAWL_BATCHES", default=3) if seperate_into_days > len(city_list): seperate_into_days = len(city_list) batch_count = math.ceil(len(city_list) / seperate_into_days) today_batch = number_day_of_this_year % seperate_into_days start_index = today_batch * batch_count - 1 end_index = (today_batch + 1) * batch_count urls = [] for index, city in enumerate(city_list): if (start_index < index) and (index < end_index): url = f"https://{city}.esf.fang.com/" if "city" == self.city_name_for_districts else f"https://{self.city_name_for_districts}.esf.fang.com/house-{city}/" urls.append(url) meta_dict = { "page_type": "index", "total_pages": 0, "index_level": 0, } if "city" != self.city_name_for_districts: meta_dict["index_level"] = 1 if self.use_proxy: proxies_dict = self.proxy_ip_pool() if 1 > len(proxies_dict): sys.exit(3) meta_dict["proxy"] = proxies_dict["http"] for url in urls: yield scrapy.Request(url=url, callback=self.parse_list_page, meta=meta_dict, dont_filter=True) elif "GET_CHANNELS" == self.run_purpose: # GET_CHANNELS is one kind of debug urls = [] city_list = self.settings.get("CITY_LIST", default=[]) for index, city in enumerate(city_list): urls.append(f"https://{city}.esf.fang.com/") if 0 < len(urls): meta_dict = { "page_type": "index", "total_pages": 0, "index_level": 0, } yield scrapy.Request(url=urls[0], callback=self.parse_list_page, meta=meta_dict, dont_filter=True) elif "CHECK_PROXY_IP" == self.run_purpose: now = int(time.time()) token = f"Guangzhou{str(now)}" m = hashlib.md5() m.update(token.encode(encoding='utf-8')) urls = [ f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}", ] if "DRAGONFLY" == self.proxy_agent: proxies_dict = CommonClass.get_proxies(proxy_dict={}) else: proxies_dict = ProxyAgent.get_xunlian_proxy_dict( headers={}, params_for_proxy_ip={}, setup_xunlian_dict={}, need_setup_xunlian=False, logger=self.logger) if 0 < len(proxies_dict): meta_dict = {"proxy": proxies_dict["http"]} for url in urls: yield scrapy.Request(url=url, callback=self.do_nothing_for_debug, meta=meta_dict) else: self.logger.error( f"Error! No proxy ip returns. {proxies_dict}") else: urls = [ "http://quotes.toscrape.com/page/1/", "http://quotes.toscrape.com/page/2/", ] for url in urls: yield scrapy.Request(url=url, callback=self.do_nothing_for_debug)
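# --- Illustrative sketch (not part of the spider) ---------------------------------------------
# start_requests above spreads the city list over CRAWL_BATCHES daily batches: the day of the
# year picks today's batch, and each batch holds ceil(len(city_list) / batches) cities. The
# helper below reproduces that arithmetic in isolation; the city abbreviations in the example
# are placeholders.
import datetime
import math

def _demo_today_batch(city_list, batches=3, day_of_year=None):
    if day_of_year is None:
        day_of_year = datetime.datetime.now().timetuple().tm_yday
    batches = min(batches, len(city_list))
    batch_count = math.ceil(len(city_list) / batches)
    today_batch = day_of_year % batches
    start_index = today_batch * batch_count - 1
    end_index = (today_batch + 1) * batch_count
    return [city for index, city in enumerate(city_list) if start_index < index < end_index]

# _demo_today_batch(["sz", "gz", "fs", "dg", "zh", "hz"], batches=3, day_of_year=156)
# 156 % 3 == 0, so the first batch ["sz", "gz"] would be crawled on that day.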
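# --- Illustrative sketch (not part of the spider) ---------------------------------------------
# The TrafficSpider class that follows builds one amap traffic/status/rectangle request per tile
# in its rectangle_list. A center point is widened by 0.0292768 degrees of longitude and
# 0.027027027 degrees of latitude (the offsets generate_one_rectange derives for a 3.0 km edge at
# about 23 degrees north) and the result becomes the "rectangle" query parameter. The key below
# is a placeholder, not a real amap key.
from urllib import parse as _parse

def _demo_rectangle(center_xy_str):
    x, y = (float(v) for v in center_xy_str.split(","))
    return "%.6f,%.6f;%.6f,%.6f" % (x - 0.0292768, y - 0.027027027,
                                    x + 0.0292768, y + 0.027027027)

def _demo_traffic_url(center_xy_str, key="YOUR_AMAP_KEY"):
    query = {"level": 6, "extensions": "all", "output": "json",
             "key": key, "rectangle": _demo_rectangle(center_xy_str)}
    return f"https://restapi.amap.com/v3/traffic/status/rectangle?{_parse.urlencode(query)}"

# _demo_traffic_url("113.297603,23.385631") ->
# "https://restapi.amap.com/v3/traffic/status/rectangle?level=6&extensions=all&output=json"
# "&key=YOUR_AMAP_KEY&rectangle=113.268326%2C23.358604%3B113.326880%2C23.412658"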
class TrafficSpider(scrapy.Spider): """ sys.exit code == 1 # missing AMAP_KEYS sys.exit code == 2 # missing INPUT_XY_FILE_PATH sys.exit code == 3 # fail to generate self.rectangle_list sys.exit code == 4 # self.rectangle_list have wrong length """ name = "traffic" root_path = "" log_dir = "" xy_response_log_file_name = "" city_or_area_name = "" # debug = False # save_every_response = False crawled_dir = "" json_dir = "" output_folder_name = "" output_file_format = "json" base_uri = "" run_purpose = None overwrite_today = "" custom_settings = CommonClass.get_custom_settings_dict(spider=name) # crontab will start a new process in every 6 hours; therefore in 1 day, the crontab will start 4 times # 5 minute between 2 adjacent reqests # the followings will be inititated in every 6 hours maximal_requests_of_one_crontab_process = 71 interval_between_requests = 300 amap_key_list = [] amap_key_pointer = 0 request_counter = 0 # from 0 to 71 request_number_per_batch = 405 # there are 405 requests in every 5 minutes # request_number_per_batch = 10 rectangle_list = [ ] # items looks like "113.267593,23.358604;113.337613,23.412658", it equals to list( self.edges_of_center_xy_dict.keys() ) edges_of_center_xy_dict = { } # key looks like "113.267593,23.358604;113.337613,23.412658" and item looks like "113.737414,22.543564" xy_seen_dict = { } # key looks like "113.737414,22.543564" and item looks like 23 xy_seen_updated_bool = False # the followings will be initiated in every 5 minutes last_batch_request_list = [] last_batch_request_timestamp_float = 0.0 # if good response returned, then we use self.last_batch_request_timestamp_float urls = [] def get_next_amap_key(self): self.amap_key_pointer -= 1 if 0 > self.amap_key_pointer: self.amap_key_pointer = len(self.amap_key_list) - 1 return self.amap_key_list[self.amap_key_pointer] def get_one_batch_urls(self): """ https://restapi.amap.com/v3/traffic/status/rectangle?level=6&extensions=all&output=json&rectangle=113.2675927679,23.3586043241;113.3376127679,23.4126583781&key=4ebb849f151dddb3e9aab7abe6e344e2 """ self.urls = [] self.last_batch_request_list = [] query_dict = { "level": 6, "extensions": "all", "output": "json", "key": self.get_next_amap_key(), } for one_retangle in self.rectangle_list: query_dict["rectangle"] = one_retangle self.last_batch_request_list.append(one_retangle) self.urls.append(f"{self.base_uri}?{parse.urlencode(query_dict)}") self.last_batch_request_timestamp_float = time.time() return self.urls def generate_one_rectange(self, center_xy_str=""): if not isinstance(center_xy_str, str) or 1 > len(center_xy_str): return "" xy_list = center_xy_str.split(",") x = float(xy_list[0]) y = float(xy_list[1]) # edge = 3.0 km # lat_delta = 0.009009009*edge = 0.027027027 # 赤道长度40075公里;北纬23度每一经度长40075 * sin(90-23) / 360 = 36889.23 / 360 = 102.47008889公里 # lng_delta = 0.009759*edge = 0.0292768 return "%.6f,%.6f;%.6f,%.6f" % (x - 0.0292768, y - 0.027027027, x + 0.0292768, y + 0.027027027) def init_self_rectangles(self): if isinstance(self.rectangle_list, list) and 0 < len(self.rectangle_list): return self.rectangle_list try: with open(self.input_xy_file_path, "r", encoding="utf-8") as xy_file: overall_list = xy_file.readlines() for index, one_xy in enumerate(overall_list): xy_list = one_xy.split(",") if isinstance(xy_list, list) and 2 == len(xy_list): center_xy = "%.6f,%.6f" % (float( xy_list[0]), float(xy_list[1])) rect_str = self.generate_one_rectange( center_xy_str=center_xy) self.rectangle_list.append(rect_str) self.edges_of_center_xy_dict[rect_str] = 
center_xy except Exception as ex: self.logger.error( f"cannot read xy_list file ({xy_file_path}). Exception = {ex}") sys.exit(3) else: return self.rectangle_list def init_self_xy_response_log(self): log_file_path = os.path.join(self.log_dir, self.xy_response_log_file_name) try: with open(log_file_path, "r", encoding="utf-8") as xy_log_file: overall_list = xy_log_file.readlines() for index, one_xy in enumerate(overall_list): xy_list = one_xy.split(",") if isinstance(xy_list, list) and 3 == len(xy_list): center_xy = "%.6f,%.6f" % (float( xy_list[0]), float(xy_list[1])) self.xy_seen_dict[center_xy] = int(xy_list[2]) except Exception as ex: self.logger.error( f"cannot read historical xy_log_file ({log_file_path}). Exception = {ex}" ) # do not sys.exit(3) here self.xy_seen_updated_bool = True return False else: return True def init_self_attributes(self): self.root_path = self.settings.get("PROJECT_PATH") self.log_dir = self.settings.get(name="LOG_DIR", default="") self.xy_response_log_file_name = self.settings.get( name="XY_RESPONSE_LOG_FILE_NAME", default="") self.city_or_area_name = self.settings.get(name="CITY_OR_AREA_NAME", default="") # self.debug = self.settings.get( name = "PROJECT_DEBUG", default=False ) # self.save_every_response = self.settings.get( name = "SAVE_EVERY_RESPONSE", default=False ) self.crawled_dir = self.settings.get(name="CRAWLED_DIR", default="") self.json_dir = self.settings.get(name="SAVED_JSON", default="") self.output_folder_name = self.settings.get(name="OUTPUT_FOLDER_NAME", default="") self.base_uri = self.settings.get(name="BASE_URI", default="") self.run_purpose = self.settings.get(name="RUN_PURPOSE", default=None) self.overwrite_today = self.settings.get(name="OVERWRITE_TODAY", default="") self.maximal_requests_of_one_crontab_process = self.settings.get( name="MAXIMAL_REQUESTS_OF_ONE_CRONTAB_PROCESS", default=71) self.interval_between_requests = self.settings.get( name="INTERVAL_BETWEEN_REQUESTS", default=300) self.amap_key_list = self.settings.get(name="AMAP_KEYS", default=[]) if 1 > len(self.amap_key_list): self.logger.error(f"self.amap_key_list is empty") sys.exit(1) self.input_xy_file_path = self.settings.get(name="INPUT_XY_FILE_PATH", default="") if not isinstance(self.input_xy_file_path, str) or 1 > len(self.input_xy_file_path): self.logger.error(f"missing INPUT_XY_FILE_PATH") sys.exit(2) self.init_self_rectangles() if self.request_number_per_batch != len(self.rectangle_list): self.logger.error( f"self.rectangle_list length shall be {self.request_number_per_batch}" ) sys.exit(4) self.init_self_xy_response_log() def check_dirs_and_files(self): if not os.path.isdir(self.crawled_dir): os.makedirs(self.crawled_dir) if not os.path.isdir(self.json_dir): os.makedirs(self.json_dir) def start_requests(self): self.init_self_attributes() self.check_dirs_and_files() if "INITIALIZE_AMAP_XY" == self.run_purpose: """ 生成xy坐标文件需要将山上、水面上的坐标去除掉。另外20190703发现广东东莞是没有数据的。要将广东东莞的数据删除掉 """ xy_file_name = "data4cities_bd09.txt" xy_file_path = os.path.join(self.root_path, self.name, xy_file_name) try: with open(xy_file_path, "r", encoding="utf-8") as f: overall_lines = f.readlines() overall_list = overall_lines[0].split(";") for index, one_xy in enumerate(overall_list): xy_list = one_xy.split(",") if isinstance(xy_list, list) and 2 == len(xy_list): xy = "%.6f,%.6f" % (float( xy_list[0]), float(xy_list[1])) one_url = f"https://restapi.amap.com/v3/assistant/coordinate/convert?locations={xy}&coordsys=baidu&output=json&key=4ebb849f151dddb3e9aab7abe6e344e2" meta_dict = { "x": 
float(xy_list[0]), "y": float(xy_list[1]), "index": index, } yield scrapy.Request( url=one_url, callback=self.initialize_amap_xy, meta=meta_dict, dont_filter=True) except Exception as ex: urls = [] self.logger.error( f"cannot read xy_list file ({xy_file_path}). Exception = {ex}" ) elif "READ_JSON_AND_WRITE_CSV" == self.run_purpose: one_url = "https://blog.csdn.net/qq_37193537/article/details/78987949" callback_func = self.read_json_and_parse yield scrapy.Request(url=one_url, callback=callback_func, dont_filter=True) else: self.get_one_batch_urls() meta_dict = { # we use self.last_batch_request_timestamp_float "redo_counter": 0, } for index, one_url in enumerate(self.urls): meta_dict["center_xy_index"] = index self.logger.info(f"{index}: requesting {one_url} ") yield scrapy.Request(url=one_url, callback=self.parse_json, meta=meta_dict, dont_filter=True) def initialize_amap_xy(self, response): if response is None or not hasattr(response, "body") or not hasattr( response, "url") or not hasattr(response, "meta"): self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, bad response object" ) return None meta_dict = response.meta bd09xy = "%.6f,%.6f" % (meta_dict["x"], meta_dict["y"]) index = meta_dict["index"] json_dict = json.loads(response.body) if "status" not in json_dict.keys( ) or "locations" not in json_dict.keys() or 1 != int( json_dict["status"]): self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, bad response status" ) return None if not isinstance(json_dict["locations"], str) or 1 > len(json_dict["locations"]): self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, bad response locations" ) return None amap_xy = json_dict["locations"] this_row = f"{index}:{bd09xy}==>{amap_xy}" new_xy_file_name = "data4cities_amap.txt" new_xy_log_file_name = "bd09to_amap.log" new_xy_log_file_name = os.path.join(self.root_path, self.name, new_xy_log_file_name) new_xy_file_name = os.path.join(self.root_path, self.name, new_xy_file_name) CommonScrapyPipelineClass.append_row( spider_obj=self, key_list=["xy"], item_list=[amap_xy], csv_file_path_str=new_xy_file_name) CommonScrapyPipelineClass.append_row( spider_obj=self, key_list=["xy"], item_list=[this_row], csv_file_path_str=new_xy_log_file_name) def read_json_and_parse(self, response): self.logger.info( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, todo..." 
) def check_trafficinfo_dict(self, trafficinfo_dict={}, status_int=-1, infocode_int=-1, center_xy_index_int=-1): """ 有可能得到下面的空结果;这个时候需要比对历史记录 {"status":"1","info":"OK","infocode":"10000","trafficinfo":{"description":[],"evaluation":{"expedite":[],"congested":[],"blocked":[],"unknown":[],"status":[],"description":[]},"roads":[]}} """ if not isinstance(trafficinfo_dict, dict) or 1 > len(trafficinfo_dict): return False if 1 != status_int or 10000 != infocode_int: return False edges = self.rectangle_list[ center_xy_index_int] if center_xy_index_int in range( len(self.rectangle_list)) else "" center_xy = self.edges_of_center_xy_dict[ edges] if edges in self.edges_of_center_xy_dict.keys() else "" # if "roads" not in trafficinfo_dict.keys() or not isinstance( trafficinfo_dict["roads"], list ) or 1 > len( trafficinfo_dict["roads"] ): # 比对历史记录 # if center_xy in self.xy_seen_dict.keys() and 0 == int(self.xy_seen_dict[center_xy]): # return True # 0表示该xy已经请求过3次,都返回空 # elif center_xy in self.xy_seen_dict.keys() and 0 > int(self.xy_seen_dict[center_xy]): # if -3 == int(): # self.xy_seen_dict[center_xy] = 0 # -1, -2, -3分别表示第1、2、3次请求返回空 # else: # self.xy_seen_dict[center_xy] -= 1 # self.xy_seen_updated_bool = True # elif center_xy not in self.xy_seen_dict.keys(): # self.xy_seen_dict[center_xy] = -1 # self.xy_seen_updated_bool = True # return False # 经过测试,上述方案会产生大量请求 if center_xy not in self.xy_seen_dict.keys() or len( trafficinfo_dict["roads"]) > int(self.xy_seen_dict[center_xy]): self.xy_seen_dict[center_xy] = len(trafficinfo_dict["roads"]) self.xy_seen_updated_bool = True return True def parse_json(self, response): status, infocode, message, result_dict = self.save_json( response=response, page_type="json") now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") meta_dict = response.meta center_xy_index = int( meta_dict["center_xy_index"] ) if "center_xy_index" in meta_dict.keys() else -1 if self.check_trafficinfo_dict(trafficinfo_dict=result_dict, status_int=status, infocode_int=infocode, center_xy_index_int=center_xy_index): loader = ItemLoader(item=TrafficItem(), response=response) loader = self.load_items_into_loader(loader=loader, text=result_dict, url=response.url, now=now) yield loader.load_item() else: edges = self.rectangle_list[ center_xy_index] if center_xy_index in range( len(self.rectangle_list)) else "" center_xy = self.edges_of_center_xy_dict[ edges] if edges in self.edges_of_center_xy_dict.keys() else "" center_xy_index = -1 error_msg = f"redo request from {response.url} for {center_xy} because status == {status}, infocode == {infocode}" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) meta_dict["redo_counter"] += 1 yield scrapy.Request(url=response.url, callback=self.parse_json, meta=meta_dict, dont_filter=True) if -1 < center_xy_index: received_all_reponses_per_batch_bool = self.check_this_center_xy( center_xy_index_int=center_xy_index) print( f"received_all_reponses_per_batch_bool == {received_all_reponses_per_batch_bool}; center_xy_index = {center_xy_index}" ) # get data again after 5 minutes if self.request_counter < self.maximal_requests_of_one_crontab_process and received_all_reponses_per_batch_bool: while (self.check_time_interval()): time.sleep(10) self.request_counter += 1 now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") self.logger.info( f" requesting amap at {now} ( {self.request_counter} of { self.maximal_requests_of_one_crontab_process } )" ) self.get_one_batch_urls() meta_dict = { "redo_counter": 
0, } for index, one_url in enumerate(self.urls): meta_dict["center_xy_index"] = index self.logger.info(f"{index}: requesting {one_url} ") yield scrapy.Request(url=one_url, callback=self.parse_json, meta=meta_dict, dont_filter=True) def check_time_interval(self): if time.time() - self.last_batch_request_timestamp_float > float( self.interval_between_requests): return False return True def check_this_center_xy(self, center_xy_index_int=-1): if center_xy_index_int not in range(len(self.rectangle_list)): return True # 4 minutes have passed, just return True if time.time() - self.last_batch_request_timestamp_float > 240.0: temp_list = [] for one_edge in self.last_batch_request_list: if one_edge in self.edges_of_center_xy_dict.keys(): temp_list.append(self.edges_of_center_xy_dict[one_edge]) self.logger.error( f"after 4 minutes, there are still {len(self.last_batch_request_list)} waiting for response: {temp_list} " ) return True # remove current preset_route edges = self.rectangle_list[ center_xy_index_int] if center_xy_index_int in range( len(self.rectangle_list)) else "" if edges in self.last_batch_request_list: self.last_batch_request_list.remove(edges) if 1 > len(self.last_batch_request_list): return True print(f"len == {len( self.last_batch_request_list )}") # There are(is an) element(s) in self.last_batch_request_list return False def load_items_into_loader(self, loader=None, text={}, url="", now=""): loader.add_value("url", url) loader.add_value("project", self.settings.get("BOT_NAME")) loader.add_value("spider", self.name) loader.add_value("server", socket.gethostname()) loader.add_value("date", now) loader.add_value("content", str(text)) loader.add_value("page_type", "json") return loader def get_json_file_name(self, url_str="", status_int=-4): now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") rectangle = "" url_obj = parse.urlparse(url_str) if hasattr(url_obj, "query"): query_dict = parse.parse_qs(url_obj.query) if "rectangle" in query_dict.keys(): rectangle = query_dict["rectangle"] if isinstance(rectangle, list) and 0 < len(rectangle): rectangle = rectangle[0] rectangle = rectangle.strip("'") rectangle = rectangle.replace(";", "___") rectangle = rectangle.replace(",", "_") if 1 > len(rectangle): return "" return os.path.join( self.json_dir, f"{self.city_or_area_name}___{rectangle}___{status_int}___{now}.json" ) def save_json(self, response=None, page_type="json"): """ during this tryout running, we still save the json response.body. But we will NOT in future. 
""" status = -4 infocode = 0 result_dict = {} if response is None or not hasattr(response, "body") or not hasattr( response, "url") or not hasattr(response, "meta"): self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, bad response object" ) return (-1, infocode, f"wrong response object", result_dict) file_path = "" if "json" == page_type: json_dict = json.loads(response.body) status = json_dict["status"] if "status" in json_dict.keys( ) else "404" result_dict = json_dict[ "trafficinfo"] if "trafficinfo" in json_dict.keys() else {} infocode = json_dict["infocode"] if "infocode" in json_dict.keys( ) else "" status = int(status) infocode = int(infocode) if isinstance( infocode, str) and 0 < len(infocode) else 0 file_path = self.get_json_file_name(url_str=response.url, status_int=status) else: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, wrong parameter page_type == {page_type} from {response.url}" ) return (-2, infocode, f"page_type can ONLY be json", result_dict) return_msg = "written" if 0 < len(file_path): try: with open(file_path, "wb") as f: f.write(response.body) except Exception as ex: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, failed to write response.body from {response.url}" ) return (status, infocode, f"failed to write json file", result_dict) # not -3 return (status, infocode, return_msg, result_dict)
class PoibaiduSpider(scrapy.Spider): """ sys.exit code == 1 # missing input folder sys.exit code == 2 # missing CITY_LIST or missing input file(s) sys.exit code == 3 # classification file format error sys.exit code == 4 # already requested all xy points in city_list today sys.exit code == 5 # missing Baidu ak sys.exit code == 6 # Method make_request_uris of Class PoibaiduSpider, query_type can ONLY be 3 sys.exit code == 7 # Run out of all Baidu ak today! """ name = "poibaidu" root_path = "" log_dir = "" baidu_ak_list = [] debug = False save_every_response = False crawled_dir = "" json_dir = "" input_folder_name = "" output_folder_name = "" classification_filename = "" maximal_request_times = [] output_file_format = "json" base_uri = "" query_type = 3 query_type3edge = 0 lng_delta = 0 lat_delta = 0 baidu_status_code = {} run_purpose = None city_list = [] input_dir = "" bout = 0 category_level = 1 custom_settings = CommonClass.get_custom_settings_dict(spider=name) classification_dict = {} classification_dict_english_mapper = {} second_part_of_xy_filename = "2km_with_zero.txt" ak_pointer = 0 center_dict = {} request_scope = 2 # 检索结果详细程度。取值为1 或空,则返回基本信息;取值为2,返回检索POI详细信息 page_size = 20 # 百度API说最大值是每一次请求返回20条记录:http://lbsyun.baidu.com/index.php?title=webapi/guide/webservice-placeapi # scrapy housekeeping keys: housekeeping_key_list = [ "download_slot", "download_latency", "depth", "query", ] bad_ak_status = [ 4, 5, 210, 211, 302, ] def init_self_attributes(self): self.root_path = self.settings.get("PROJECT_PATH") self.log_dir = self.settings.get(name="LOG_DIR", default="") self.baidu_ak_list = self.settings.get(name="BAIDU_AK", default=[]) if 1 > len(self.baidu_ak_list): self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, missing Baidu ak" ) sys.exit(5) self.debug = self.settings.get(name="PROJECT_DEBUG", default=False) self.save_every_response = self.settings.get( name="SAVE_EVERY_RESPONSE", default=False) self.crawled_dir = self.settings.get(name="CRAWLED_DIR", default="") self.json_dir = self.settings.get(name="SAVED_JSON", default="") self.input_folder_name = self.settings.get(name="INPUT_FOLDER_NAME", default="") self.output_folder_name = self.settings.get(name="OUTPUT_FOLDER_NAME", default="") self.classification_filename = self.settings.get( name="QUERY_CLASSIFICATION_FILENAME", default="") self.maximal_request_times = self.settings.get( name="MAXIMAL_REQUEST_TIMES", default=[]) self.output_file_format = self.settings.get(name="OUTPUT_FILE_FORMAT", default="json") self.base_uri = self.settings.get(name="BASE_URI", default="") self.query_type = self.settings.get(name="QUERY_TYPE", default=3) self.query_type3edge = self.settings.get(name="QUERY_TYPE3EDGE", default=1.1) # https://zhidao.baidu.com/question/138957118823573885.html # 北纬30度,应该是0.010402707553*edge # 北纬45度,应该是0.0127406627241*edge # 北纬60度,应该是0.01801801801802*edge self.lng_delta = 0.01167 * self.query_type3edge self.lat_delta = 0.009009009 * self.query_type3edge # 每一纬度是111公里 self.baidu_status_code = self.settings.get(name="BAIDU_STATUS_CODE", default={}) self.run_purpose = self.settings.get(name="RUN_PURPOSE", default=None) self.city_list = self.settings.get(name="CITY_LIST", default=[]) self.input_dir = os.path.join(self.root_path, self.name, self.input_folder_name) self.bout = self.settings.get(name="RUN_PURPOSE_BOUT", default=1) self.category_level = self.settings.get(name="NEED_LEVELS", default=1) self.classification_dict_english_mapper = self.settings.get( 
name="DATABASE_ENGLISH_CATEGORY_TABLE", default={}) def check_dirs_and_files(self): if not os.path.isdir(self.crawled_dir): os.makedirs(self.crawled_dir) if not os.path.isdir(self.json_dir): os.makedirs(self.json_dir) # check all files and dirs if not os.path.isdir(self.input_dir): self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, input folder ({self.input_dir}) and input files are needed." ) sys.exit(1) temp_list = [] missed_input_file = [] for one_city in self.city_list: input_file_path = os.path.join( self.input_dir, f"{one_city}{self.second_part_of_xy_filename}") if os.path.isfile(input_file_path): temp_list.append(one_city) else: missed_input_file.append(one_city) self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, missing {input_file_path}" ) if 0 < len(missed_input_file): self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, missing input files of {missed_input_file}" ) sys.exit(2) # self.city_list = temp_list if 1 > len(self.city_list): # errorMsg = f"Missing input files of {missed_input_file}" if 0 < len(missed_input_file) else f"please indicate which cities you want to request POIs" errorMsg = f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, please indicate which cities you want to request POIs" self.logger.error(errorMsg) sys.exit(2) def read_xy_file(self, city=""): """ return a list [] that including this city's xy points """ center = [] temp_list = [] if 1 > len(city): return center today = datetime.datetime.now().strftime("%Y%m%d") try: input_filename = f"{city}{self.second_part_of_xy_filename}" with open(os.path.join(self.input_dir, input_filename), 'r', encoding='utf-8') as f: for item in f.readlines()[1:]: center.append(tuple(item.strip().split( ",")[-5:])) # lng, lat, ok0, max_value, max_timestamp except Exception as ex: center = [] self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, cannot read xy_list file ({input_filename}) or requested xy points file ({input_filename}). 
Exception = {ex}" ) return center def parse_uri_query_to_dict(self, url="", only_these_keys=[], map_query_english=True): result_dict = {} query_part_list = url.split("?") if 2 == len(query_part_list): result_dict = parse.parse_qs(query_part_list[1]) for index, key in enumerate(result_dict): if 1 == len(result_dict[key]): result_dict[key] = result_dict[key][0] else: self.logger.warning( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, length of {len(result_dict[key])} is more than 1" ) if 0 < len(only_these_keys): temp_dict = {} for index, key in enumerate(result_dict): if key in only_these_keys: temp_dict[key] = result_dict[key] result_dict = temp_dict if "bounds" in result_dict.keys(): result_dict["bounds"] = result_dict["bounds"].replace(",", "_") if map_query_english and "query" in result_dict.keys(): if result_dict[ "query"] in self.classification_dict_english_mapper.keys(): result_dict["query"] = self.classification_dict_english_mapper[ result_dict["query"]] else: result_dict[ "query"] = f"unknown_english_name{random.randint(10000,99999)}" return result_dict def return_next_ak(self): self.ak_pointer += 1 if self.ak_pointer >= len(self.baidu_ak_list): # do not use == self.ak_pointer = 0 return self.baidu_ak_list[self.ak_pointer] def make_request_uris(self, query_type=3, exclude_requested_today=True): """ As of 20190529, ONLY 3 == query_type is coded 1 == query_type: "http://api.map.baidu.com/place/v2/search?query=ATM机&tag=银行®ion=北京&output=json&ak=您的ak" # requesting pois in one city 2 == query_type: "http://api.map.baidu.com/place/v2/search?query=银行&location=39.915,116.404&radius=2000&output=xml&ak=您的密钥" # requesting pois in one circle area 3 == query_type: "http://api.map.baidu.com/place/v2/search?query=银行&bounds=39.915,116.404,39.975,116.414&output=json&ak={您的密钥}" # requesting pois in one rectangle area 4 == query_type: "http://api.map.baidu.com/place/v2/detail?uid=435d7aea036e54355abbbcc8&output=json&scope=2&ak=您的密钥" # requesting pois at one location with detailed address """ base_uri = self.base_uri if 4 == query_type: base_uri = self.base_uri.replace("/search", "/detail") if 3 != query_type: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, argumnet query_type can ONLY be 3; 1/2/4 are under developing" ) sys.exit(6) today = datetime.datetime.now().strftime("%Y%m%d") urls = {} all_categories = [] if 1 == self.category_level: all_categories = self.classification_dict.keys() elif 2 == self.category_level: for index, level1key in enumerate(self.classification_dict): all_categories += self.classification_dict[level1key] elif 3 == self.category_level: all_categories = list(self.classification_dict.keys()) for index, level1key in enumerate(self.classification_dict): all_categories += self.classification_dict[level1key] for index, city in enumerate(self.center_dict): # read this city's log file to exclude today's requested points requested = [] finished_xy_filename = f"{city}_finished_xy_query_points_{today}.log" finished_file_path = os.path.join(self.log_dir, finished_xy_filename) if exclude_requested_today and os.path.isfile(finished_file_path): with open(finished_file_path, "r", encoding="utf-8") as log_file: for item in log_file.readlines(): value = item.strip().split(",") if 3 == len(value): requested.append( f"{value[0]}___{value[1]}___{value[2]}") excluded = [] for category in all_categories: city_category_dict = {} for item in self.center_dict[city]: lng, lat, ok0, max_value, max_timestamp = 
item requested_key = f"{lng}___{lat}___{category}" if exclude_requested_today and 0 < len(requested): if requested_key in requested: excluded.append(requested_key) continue lng, lat = float(lng), float(lat) lng_min = float("%.3f" % (lng - 0.5 * self.lng_delta)) lng_max = float("%.3f" % (lng + 0.5 * self.lng_delta)) lat_min = float("%.3f" % (lat - 0.5 * self.lat_delta)) lat_max = float("%.3f" % (lat + 0.5 * self.lat_delta)) bounds = f"{lat_min},{lng_min},{lat_max},{lng_max}" city_category_dict[ requested_key] = f"{base_uri}?query={category}&page_size={self.page_size}&page_num=0&scope={self.request_scope}&bounds={bounds}&output={self.output_file_format}&ak={self.return_next_ak()}" if 0 < len(city_category_dict): urls[f"{city}___{category}"] = city_category_dict if 0 < len(excluded): self.logger.info( f"{len(excluded)} requests have been excluded in City {city}: ({excluded})" ) return urls def make_point_request_from_500_by_500(self, url_fragment_list=[]): url = url_fragment_list[-1] bounds = "" xy_list = [] new_center = {} if 0 < len(url): result_dict = parse.parse_qs(url) if 0 < len(result_dict) and "bounds" in result_dict.keys(): bounds = result_dict["bounds"][0] if 0 < len(bounds): xy_list = bounds.split(",") if 4 == len(xy_list): # bounds=23.091,113.306,23.097,113.313 y_min = float(xy_list[0]) x_min = float(xy_list[1]) y_max = float(xy_list[2]) x_max = float(xy_list[3]) delta_x = int(1000 * (x_max - x_min)) delta_y = int(1000 * (y_max - y_min)) if 0 < delta_x and 0 < delta_y: for i in range(delta_x): for j in range(delta_y): key_x = "%.6f" % (x_min + i / 1000) key_y = "%.5f" % (y_min + j / 1000) x = "%.3f" % (x_min + i / 1000) y = "%.3f" % (y_min + j / 1000) x_plus_1 = "%.3f" % (x_min + (i + 1) / 1000) y_plus_1 = "%.3f" % (y_min + (j + 1) / 1000) new_center[ f"{key_x}___{key_y}"] = f"{y},{x},{y_plus_1},{x_plus_1}" return new_center def make_16_request_from_2km_by_2km(self, url_fragment_list=[]): new_center = {} lng, lat = float(url_fragment_list[0]), float(url_fragment_list[1]) if 0 < lng and 0 < lat: center_xy = [ lng, lat, ] new_center = self.get_center_xys_from_single_xy( center_xy, half_edge_seperator=2, query_type3edge=1.1) return new_center def get_center_xys_from_single_xy(self, center_xy=[], half_edge_seperator=2, query_type3edge=1.1): new_center = {} if not isinstance(half_edge_seperator, int): return new_center lng, lat = float(center_xy[0]), float(center_xy[1]) old_half_edge = float("%.3f" % (query_type3edge / 1.1)) xy_point_list = [] new_span = old_half_edge / half_edge_seperator lng_delta = 0.01167 * new_span lat_delta = 0.009009009 * new_span x_minimal = "%.6f" % (lng - (half_edge_seperator - 0.5) * lng_delta) y_minimal = "%.5f" % (lat - (half_edge_seperator - 0.5) * lat_delta) for i in range(half_edge_seperator * 2): for j in range(half_edge_seperator * 2): x = "%.6f" % (lng + i * lng_delta) y = "%.5f" % (lat + j * lat_delta) key = f"{x}___{y}" xy_point_list.append(key) for one in xy_point_list: temp_list = one.split("___") if 2 == len(temp_list): bound_string = self.get_small_bounds( center_xy=temp_list, query_type3edge=new_span * 1.2) # for smaller rectangle, we use 1.2 if 0 < len(bound_string): new_center[one] = bound_string return new_center def get_small_bounds(self, center_xy=[], query_type3edge=0.55): lng_delta = 0.01167 * query_type3edge lat_delta = 0.009009009 * query_type3edge lng, lat = float(center_xy[0]), float(center_xy[1]) lng_min = float("%.3f" % (lng - 0.5 * lng_delta)) lng_max = float("%.3f" % (lng + 0.5 * lng_delta)) lat_min = float("%.3f" % (lat - 0.5 * 
lat_delta)) lat_max = float("%.3f" % (lat + 0.5 * lat_delta)) if 0 < lng_min and 0 < lng_max and 0 < lat_min and 0 < lat_max: return f"{lat_min},{lng_min},{lat_max},{lng_max}" else: return "" def do_makeup_requests(self, query_type=3, bout=1, single_line=""): all_city_dict = {} if bout not in [ 1, 2, ] or 3 != query_type: return all_city_dict base_uri = self.base_uri if 4 == query_type: base_uri = self.base_uri.replace("/search", "/detail") if 0 == len(single_line): today = datetime.datetime.now().strftime("%Y%m%d") empty_file_dir = os.path.join(self.root_path, self.name, self.output_folder_name, f"{today}waiting4next") file_list = os.listdir(empty_file_dir) all_lines = [] for one_file in file_list: try: this_file_path = os.path.join(empty_file_dir, one_file) with open(this_file_path, 'r', encoding='utf-8') as f: for item in f.readlines(): all_lines.append(item) except Exception as ex: all_lines = [] self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, cannot read xy_list file ({this_file_path}). Exception = {ex}" ) else: all_lines = [single_line] new_center = {} for item in all_lines: url_fragment_list = item.strip().split("___") if 1 == bout: new_center = self.make_16_request_from_2km_by_2km( url_fragment_list) elif 2 == bout: new_center = self.make_point_request_from_500_by_500( url_fragment_list) category = url_fragment_list[2] city = url_fragment_list[3] city_category_key = f"{city}___{category}" for index, key in enumerate(new_center): temp_list = key.split("___") if 2 == len(temp_list): requested_key = f"{temp_list[0]}___{temp_list[1]}___{category}" bounds = new_center[key] temp_dict = {} if city_category_key in all_city_dict.keys(): temp_dict = all_city_dict[city_category_key] temp_dict[ requested_key] = f"{base_uri}?query={category}&page_size={self.page_size}&page_num=0&scope={self.request_scope}&bounds={bounds}&output={self.output_file_format}&ak={self.return_next_ak()}" all_city_dict[city_category_key] = temp_dict return all_city_dict def start_requests(self): self.init_self_attributes() self.check_dirs_and_files() self.read_classification_file() self.center_dict = {} for city in self.city_list: center_list = self.read_xy_file(city=city) if 0 < len(center_list): self.center_dict[city] = center_list if 1 > len(self.center_dict): self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, you already requested all xy points in {self.city_list} today." 
) sys.exit(4) if "REDO_OVER400_POIS" == self.run_purpose: url_dict = self.do_makeup_requests(query_type=3, bout=self.bout, single_line="") else: url_dict = self.make_request_uris(query_type=3, exclude_requested_today=True) callback_func = self.parse_json if self.debug: callback_func = self.do_nothing_for_debug meta_dict = {} for index, key in enumerate(url_dict): temp_list = key.split("___") if 2 == len(temp_list): meta_dict = { "city": temp_list[0], "category": temp_list[1], "page_num": 0, } for inner_index, center_xy in enumerate(url_dict[key]): one_url = url_dict[key][center_xy] temp_list = center_xy.split("___") if 3 == len(temp_list): meta_dict["center_x"] = temp_list[0] meta_dict["center_y"] = temp_list[1] if "REDO_OVER400_POIS" == self.run_purpose: self.logger.info( f"requesting {one_url}; meta = {meta_dict}") else: self.logger.info(f"requesting {one_url}") yield scrapy.Request(url=one_url, callback=callback_func, meta=meta_dict, dont_filter=True) else: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {center_xy} error in url_dict[key] ({len(url_dict[key])})" ) else: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {key} error in url_dict ({len(url_dict)})" ) continue def do_nothing_for_debug(self, response): pass def parse_json(self, response): status, message = self.save_json(response=response, page_type="json") callback_func = self.parse_json url = response.url today = datetime.datetime.now().strftime("%Y%m%d") if status in [ -1, -2, -3, ]: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, wrong parameter(s) have been passed to self.save_json!" ) elif 404 == status: meta_dict = self.request_counter_and_action(response=response) if 0 < meta_dict["request_counter"]: yield scrapy.Request(url=response.url, callback=callback_func, meta=meta_dict, dont_filter=True) elif status in [ 2, 3, ]: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, failed to request {response.url} using wrong parameters or verification error(s)" ) elif status in self.bad_ak_status: self.logger.info( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, failed to request {response.url}; run out of Baidu Ak quota or wrong settings; status code {status}" ) temp_list = message.split("___") if 4 == len(temp_list): bad_ak = temp_list[3] if 0 < len(bad_ak) and bad_ak in self.baidu_ak_list: self.baidu_ak_list.remove(bad_ak) if 1 > len(self.baidu_ak_list): self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, run out of all Baidu ak today!" ) sys.exit(7) else: yield scrapy.Request(url=response.url, callback=callback_func, meta=response.meta, dont_filter=True) else: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, self.save_json did NOT pass the bad ak back!" 
) elif 0 == status: temp_list = message.split("___") if 3 == len(temp_list): page_num = int(temp_list[1]) total = int(temp_list[2]) will_divide_request = False if 400 <= total: meta_dict = {} if hasattr(response, "meta"): meta_dict = response.meta needed_keys = [ "city", "center_x", "center_y", ] city = meta_dict["city"] if "city" in meta_dict.keys( ) else "" center_x = meta_dict[ "center_x"] if "center_x" in meta_dict.keys() else "" center_y = meta_dict[ "center_y"] if "center_y" in meta_dict.keys() else "" query = meta_dict[ "category"] if "category" in meta_dict.keys() else "" content = f"{center_x}___{center_y}___{query}___{city}___{page_num}___{total}___{url}" # begin to request 16 times for 1 == bout or 0.001 step for 2 == bout bout = int( meta_dict["bout"]) if "bout" in meta_dict.keys() else 1 if 3 > bout: url_dict = self.do_makeup_requests(query_type=3, bout=bout, single_line=content) for index, key in enumerate(url_dict): temp_list = key.split("___") if 2 == len(temp_list): meta_dict = { "city": temp_list[0], "category": temp_list[1], "page_num": 0, "bout": bout + 1, } for inner_index, center_xy in enumerate( url_dict[key]): one_url = url_dict[key][center_xy] temp_list = center_xy.split("___") if 3 == len(temp_list): meta_dict["center_x"] = temp_list[0] meta_dict["center_y"] = temp_list[1] self.logger.info( f"requesting {one_url}; meta = {meta_dict}" ) yield scrapy.Request( url=one_url, callback=self.parse_json, meta=meta_dict, dont_filter=True) else: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {center_xy} error in url_dict[key] ({len(url_dict[key])})" ) else: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {key} error in url_dict ({len(url_dict)})" ) continue will_divide_request = True else: xy_over400_filename = f"{city}_over400_xy_{today}.log" self.write_log(content=content, logfilename=xy_over400_filename, content_only=True) will_divide_request = False # directly save this json file even it contains more than 400 POIs if not will_divide_request: json_dict = json.loads(response.body) result_list = json_dict[ "results"] if "results" in json_dict.keys() else [] this_page_pois_list = [] for one_poi in result_list: this_poi_dict = self.process_one_baidu_poi_json_dict( json_dict=one_poi) this_page_pois_list.append(this_poi_dict) housekeeping_dict = {} meta_dict = {} if hasattr(response, "meta"): meta_dict = response.meta for one_key in self.housekeeping_key_list: housekeeping_dict[one_key] = meta_dict[ one_key] if one_key in meta_dict.keys() else "" housekeeping_dict["query"] = meta_dict[ "category"] if "category" in meta_dict.keys() else "" # yield to pipeline try: for one_poi in this_page_pois_list: loader = ItemLoader(item=PoibaiduItem(), response=response) loader = self.load_items_into_loader( loader=loader, one_poi_dict=one_poi, housekeeping_dict=housekeeping_dict, url=url) yield loader.load_item() except Exception as ex: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, error happened during loading ItemLoader. 
Exception = {ex}" ) last_page = math.ceil(total / self.page_size) - 1 if page_num < last_page and hasattr(response, "meta"): meta_dict["page_num"] = page_num + 1 url = url.replace(f"page_num={page_num}", f"page_num={page_num+1}") yield scrapy.Request(url=url, callback=callback_func, meta=meta_dict, dont_filter=True) elif page_num < last_page: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, missing response.meta while requesting {url} (last_page == {last_page})" ) elif page_num >= last_page: # this one shall be else:, but we just add one more condition # write separate log before yielding to pipeline center_x = meta_dict[ "center_x"] if "center_x" in meta_dict.keys( ) else "" center_y = meta_dict[ "center_y"] if "center_y" in meta_dict.keys( ) else "" query = housekeeping_dict["query"] if 0 < len(meta_dict["city"]) and 0 < len( center_x) and 0 < len(center_y): city = meta_dict["city"] finished_xy_filename = f"{city}_finished_xy_query_points_{today}.log" self.write_log( content=f"{center_x},{center_y},{query}", logfilename=finished_xy_filename, content_only=True) # to have smaller I/O, we do NOT inlcude page_num and total_page here. Baidu API Server seems to be stronger than QQ else: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, self.save_json did NOT pass correct message ({message})!" ) def load_items_into_loader(self, loader=None, one_poi_dict={}, housekeeping_dict={}, url=""): # record housekeeping fields for index, key in enumerate(housekeeping_dict): loader.add_value(key, housekeeping_dict[key]) loader.add_value("url", url) loader.add_value("project", self.settings.get("BOT_NAME")) loader.add_value("spider", self.name) loader.add_value("server", socket.gethostname()) loader.add_value("date", datetime.datetime.now().strftime("%Y%m%d_%H%M%S")) # record all fields for database table(s) loader.add_value("content", str(one_poi_dict)) loader.add_value("page_type", "json") return loader def print_attributes(self): for one in dir(self): if not callable(getattr(self, one)) and -1 == one.find("__"): self.logger.info(f"{one} ==> {getattr(self, one)}") def read_classification_file(self): classification_dict = {} classification_file_path = os.path.join(self.input_dir, self.classification_filename) try: with open(classification_file_path, 'r', encoding='utf-8') as f: for item in f.readlines(): temp_list = item.strip().split(":") if 2 == len(temp_list): value_list = temp_list[1].split( "、") if -1 < temp_list[1].find("、") else [ temp_list[1] ] classification_dict[temp_list[0]] = value_list else: raise FileFormatException except FileFormatException as ex: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {item} cannot be splitted into 2 by a colon. Wrong format in File {classification_file_path}. Exception = {ex}" ) sys.exit(3) except Exception as ex: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, cannot read classification file ({classification_file_path}). 
Exception = {ex}" ) sys.exit(3) self.classification_dict = classification_dict def extract_response_meta_to_dict(self, response=None): city = "" category = "" page_num = -1 if hasattr(response, "meta") and "city" in response.meta.keys(): city = response.meta["city"] if hasattr(response, "meta") and "category" in response.meta.keys(): category = response.meta["category"] if hasattr(response, "meta") and "page_num" in response.meta.keys(): page_num = int(response.meta["page_num"]) return (city, category, page_num) def save_json(self, response=None, page_type="json"): if response is None or not hasattr(response, "body") or not hasattr( response, "url"): return (-1, f"wrong response format") city = "" category = "" page_num = -1 status = 404 total = "0" bad_baidu_ak = False uri_query_dict = {} if "json" == page_type: json_dict = json.loads(response.body) status = json_dict["status"] if "status" in json_dict.keys( ) else "404" total = json_dict["total"] if "total" in json_dict.keys() else "0" only_these_keys = [ "page_num", "query", "bounds", ] if int(status) in self.bad_ak_status: only_these_keys.append("ak") bad_baidu_ak = True uri_query_dict = self.parse_uri_query_to_dict( url=response.url, only_these_keys=only_these_keys, map_query_english=True) city, category, page_num = self.extract_response_meta_to_dict( response=response) page_num = uri_query_dict["page_num"] category = uri_query_dict["query"] bounds = uri_query_dict["bounds"] file_path = os.path.join( self.json_dir, f"{city}___{category}___{bounds}___{page_num}___{total}___{status}.json" ) status = int(status) else: return (-2, f"page_type can ONLY be json") try: with open(file_path, 'wb') as f: f.write(response.body) except Exception as ex: self.logger.warning( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, failed to write response.body from {response.url}" ) return (-3, f"failed to write json file") else: return_msg = f"# {page_num} of {total} pages is requested___{page_num}___{total}" if bad_baidu_ak: bad_ak = uri_query_dict["ak"] if "ak" in uri_query_dict.keys( ) else "" return_msg = f"# {page_num} of {total} pages is requested___{page_num}___{total}___{bad_ak}" return (status, return_msg) def request_counter_and_action(self, response=None): return_dict = {} if hasattr(response, "meta"): return_dict = response.meta request_counter = 0 request_pointer = 0 if hasattr(response, "meta") and "request_pointer" in response.meta.keys(): request_pointer = int(response.meta["request_pointer"]) del return_dict["request_pointer"] if hasattr(response, "meta") and "request_counter" in response.meta.keys(): request_counter = int(response.meta["request_counter"]) del return_dict["request_counter"] if request_pointer < len(self.maximal_request_times): self.logger.info( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, request_counter == {request_counter}; request_pointer == {request_pointer} for the last request from {response.url}" ) if request_counter < self.maximal_request_times[request_pointer]: return_dict["request_counter"] = request_counter + 1 return_dict["request_pointer"] = request_pointer return return_dict else: return_dict["request_counter"] = 1 return_dict["request_pointer"] = request_pointer + 1 return return_dict else: today = datetime.datetime.now().strftime("%Y%m%d") self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {self.maximal_request_times} requests have been sent but ONLY empty response.body received 
from {response.url}" ) self.write_log(content=response.url, logfilename=f"missed_uris{today}.txt", content_only=True) return_dict["request_counter"] = -1 return_dict["request_pointer"] = request_pointer return return_dict def process_one_baidu_poi_json_dict(self, json_dict={}): return_dict = {} key_list = [ "name", "address", "province", "city", "area", "telephone", "uid", "street_id", "detail", ] for one_key in key_list: if one_key in json_dict.keys(): return_dict[one_key] = json_dict[one_key] has_item = True else: return_dict[one_key] = "" # process "detail_info", "tag" and "type" if "detail_info" in json_dict.keys(): return_dict["detail_info"] = str(json_dict["detail_info"]) if "type" in json_dict["detail_info"].keys(): return_dict["type"] = json_dict["detail_info"]["type"] if "tag" in json_dict["detail_info"].keys(): return_dict["tag"] = json_dict["detail_info"]["tag"] else: return_dict["detail_info"] = str({}) if "type" not in return_dict.keys(): return_dict["type"] = "" if "tag" not in return_dict.keys(): return_dict["tag"] = "" # process "lat" and "lng" if "location" in json_dict.keys(): location = json_dict["location"] if "lat" in location.keys() and "lng" in location.keys(): return_dict["lat"] = location["lat"] return_dict["lng"] = location["lng"] if "lat" not in return_dict.keys(): return_dict["lat"] = "" if "lng" not in return_dict.keys(): return_dict["lng"] = "" return return_dict def write_log(self, content=None, logfilename=None, content_only=False): if content is not None and 0 < len(content): today = datetime.datetime.now().strftime("%Y%m%d") if logfilename is None: logfilename = f"{self.name}{today}.log" try: with open(os.path.join(self.log_dir, logfilename), 'a', encoding='utf-8') as f: if content_only: info = f"{str(content)}\n" else: info = f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}] {content}\n" f.write(info) return 1 except Exception as ex: return 0 return -1
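# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the spider above): when save_json reports
# 400 or more POIs for a bounds cell, parse_json re-requests smaller cells via
# make_16_request_from_2km_by_2km / make_point_request_from_500_by_500, which
# are not shown in this excerpt. The standalone helper below only demonstrates
# the subdivision idea on a "lat_min,lng_min,lat_max,lng_max" bounds string;
# the function name, the 4x4 split and the key format are assumptions, not the
# spider's actual implementation.
# ---------------------------------------------------------------------------
def split_bounds_example(bounds, parts=4):
    """Split a 'lat_min,lng_min,lat_max,lng_max' bounds string into parts*parts
    smaller bounds strings, keyed by each sub-cell's centre point."""
    lat_min, lng_min, lat_max, lng_max = [float(x) for x in bounds.split(",")]
    lat_step = (lat_max - lat_min) / parts
    lng_step = (lng_max - lng_min) / parts
    sub_bounds = {}
    for i in range(parts):
        for j in range(parts):
            b_lat_min = lat_min + i * lat_step
            b_lng_min = lng_min + j * lng_step
            b_lat_max = b_lat_min + lat_step
            b_lng_max = b_lng_min + lng_step
            # key mimics the "lng___lat" centre-point style used elsewhere in this file
            center_key = f"{b_lng_min + 0.5 * lng_step:.6f}___{b_lat_min + 0.5 * lat_step:.6f}"
            sub_bounds[center_key] = f"{b_lat_min:.3f},{b_lng_min:.3f},{b_lat_max:.3f},{b_lng_max:.3f}"
    return sub_bounds

# Example: 16 sub-cells of a roughly 2km x 2km cell (coordinates are made up):
# print(split_bounds_example("23.120,113.250,23.140,113.270"))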
class Land3fangSpider(scrapy.Spider): """ 在分布式scrapyd部署之前,为了起多个fangesf进程而采取的临时措施(fangesfp2是本套代码的一个拷贝)。 sys.exit code == 1 # wrong or missing RUN_PURPOSE sys.exit code == 2 # wrong or missing CRAWLED_DIR, SAVED_DETAIL_HTML, or SAVED_GAODE_JASON sys.exit code == 3 # fail to get proxy's ip sys.exit code == 4 # wrong city_code On 20190730 Peter writes this spider upon requests """ name = "land3fang" root_path = "" log_dir = "" resume_break_point_detailed_file_name = "crawled_detailed_html.log" resume_break_point_list_file_name = "crawled_list_html.log" crawled_list_url_list = [] crawled_detailed_url_list = [] debug = False city_list = [] city_name_dict = {} run_purpose = None save_every_response = False overwrite_today = "" crawled_dir = "" saved_html_dir = "" over34_filename = "" custom_settings = CommonClass.get_custom_settings_dict(spider=name) proxy_ip_dict = {} min_proxy_ip_life_time = 6 max_proxy_ip_life_time = 180 use_proxy = False proxy_agent = "" cookie_string = "" cookie_dict = {} def init_self_attributes(self): self.root_path = self.settings.get("PROJECT_PATH") self.log_dir = self.settings.get(name="LOG_DIR", default="") self.debug = self.settings.get(name="PROJECT_DEBUG", default=False) self.city_list = self.settings.get("CITY_LIST", default=[]) if 1 > len(self.city_list) and "city" == self.city_name_for_districts: self.logger.error(f"missing CITY_LIST ({self.city_list}) setting") sys.exit(1) self.city_name_dict = self.settings.get("CITY_NAME_DICT", default={}) self.run_purpose = self.settings.get(name="RUN_PURPOSE", default=None) if self.run_purpose is None: self.logger.error( f"missing RUN_PURPOSE ({self.run_purpose}) setting") sys.exit(2) self.save_every_response = self.settings.get( name="SAVE_EVERY_RESPONSE", default=False) self.overwrite_today = self.settings.get("OVERWRITE_TODAY", default="") if not hasattr(self, "overwrite_today") or 1 > len( self.overwrite_today) or self.overwrite_today is None: self.overwrite_today = datetime.datetime.now().strftime("%Y%m%d") # set all paths self.crawled_dir = self.settings.get(name="CRAWLED_DIR", default="") self.saved_html_dir = self.settings.get(name="SAVED_HTML", default="") if 1 > len(self.crawled_dir) or 1 > len(self.saved_html_dir): error_msg = f"missing CRAWLED_DIR ({self.crawled_dir}), SAVED_HTML ({self.saved_html_dir}) setting(s)" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) sys.exit(3) self.over34_filename = self.settings.get(name="OVER34_LOG_FILENAME", default="") self.min_proxy_ip_life_time = self.settings.get( name="MIN_PROXY_LIFE_SPAN", default=6) self.max_proxy_ip_life_time = self.settings.get( name="MAX_PROXY_LIFE_SPAN", default=180) self.use_proxy = self.settings.get(name="HTTPPROXY_ENABLED", default=False) self.proxy_agent = self.settings.get(name="PROXY_AGENT", default="") self.cookie_string = self.settings.get(name="COOKIE_STRING", default="") self.cookie_jar = CookieJar() def make_dirs(self): # even cache is used, we save all html files; here we make these 3 dirs if they do not exist if not os.path.isdir(self.crawled_dir): os.makedirs(self.crawled_dir) if not os.path.isdir(self.saved_html_dir): os.makedirs(self.saved_html_dir) def proxy_ip_pool(self): """ 迅联错误码10000 提取过快,请至少5秒提取一次 """ if "DRAGONFLY" == self.proxy_agent: return CommonClass.get_proxies(proxy_dict={}) now = time.time() need_new_proxy = False if self.proxy_ip_dict is None or 1 > len(self.proxy_ip_dict): need_new_proxy = True elif "expire" not in self.proxy_ip_dict.keys(): 
need_new_proxy = True elif now + 3 > self.proxy_ip_dict["expire"]: need_new_proxy = True if need_new_proxy: proxies_dict = ProxyAgent.get_xunlian_proxy_dict( headers={}, params_for_proxy_ip={}, setup_xunlian_dict={}, need_setup_xunlian=False, logger=self.logger) if 1 > len(proxies_dict): return self.proxy_ip_dict # still return the old ip dict or {} proxies_dict["expire"] = now + random.randint( self.min_proxy_ip_life_time, self.max_proxy_ip_life_time) # set ip life time self.proxy_ip_dict = proxies_dict return self.proxy_ip_dict def read_crawled_urls(self): resume_break_point_detailed_file_path = os.path.join( self.log_dir, self.resume_break_point_detailed_file_name) try: with open(resume_break_point_detailed_file_path, "r", encoding="utf-8") as log_file: self.crawled_detailed_url_list = log_file.readlines() while "" in self.crawled_detailed_url_list: self.crawled_detailed_url_list.remove("") except Exception as ex: error_msg = f"fail to read {resume_break_point_detailed_file_path}" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) # for list pages, do not use this [] to exclude seen urls def start_requests(self): self.init_self_attributes() self.make_dirs() self.read_crawled_urls() if "READ_HTML" == self.run_purpose: # READ_HTML is one kind of debug url = 'http://quotes.toscrape.com/page/1/' yield scrapy.Request(url=url, callback=self.read_and_parse) elif "PRODUCTION_RUN" == self.run_purpose: urls = [ # 广州 "https://land.3fang.com/market/440100__1______1_1_1.html", # 住宅用地: 26页 "https://land.3fang.com/market/440100__2______1_1_1.html", # 商业/办公用地: 17页 "https://land.3fang.com/market/440100__3_2__0_100000__1_1_1.html", # 工业用地, 已成交, 10万平米以下: 32页 "https://land.3fang.com/market/440100__3_2__100000_500000__1_1_1.html", # 工业用地, 已成交, 10-50万平米: 4页 "https://land.3fang.com/market/440100__3_2__500000_100000000__1_1_1.html", # 工业用地, 已成交, 50万平米以上: 1页 "https://land.3fang.com/market/440100__3_1_____1_1_1.html", # 工业用地, 未成交: 1页 "https://land.3fang.com/market/440100__3_3_____1_1_1.html", # 工业用地, 流拍: 7页 "https://land.3fang.com/market/440100__4______1_1_1.html", # 其他用地: 4页 # # 佛山 "https://land.3fang.com/market/440600__1_1_____1_1_1.html", # 住宅用地, 未成交: 8页 "https://land.3fang.com/market/440600__1_2__0_5000__1_1_1.html", # 住宅用地, 已成交, 5千平米以下: 33页 "https://land.3fang.com/market/440600__1_2__5000_100000__1_1_1.html", # 住宅用地, 已成交, 5千到10万平米: 29页 "https://land.3fang.com/market/440600__1_2__100000_100000000__1_1_1.html", # 住宅用地, 已成交, 10万平米以上: 6页 "https://land.3fang.com/market/440600__1_3_____1_1_1.html", # 住宅用地, 流拍: 3页 "https://land.3fang.com/market/440600__2______1_1_1.html", # 商业用地: 19页 "https://land.3fang.com/market/440600__3_1_____1_1_1.html", # 工业用地, 未成交: 6页 "https://land.3fang.com/market/440600__3_2__0_40000__1_1_1.html", # 工业用地, 已成交, 4万平米以下: 32页 "https://land.3fang.com/market/440600__3_2__40000_100000000__1_1_1.html", # 工业用地, 已成交, 4万平米以上: 12页 "https://land.3fang.com/market/440600__3_3_____1_1_1.html", # 工业用地, 流拍: 1页 "https://land.3fang.com/market/440600__4______1_1_1.html", # 其他用地: 3页 ] meta_dict = { "page_type": "index", "total_pages": 0, "index_level": 0, } if self.use_proxy: proxies_dict = self.proxy_ip_pool() if 1 > len(proxies_dict): sys.exit(3) meta_dict["proxy"] = proxies_dict["http"] cookie_dict = dict([ pair.split("=", 1) for pair in self.cookie_string.split("; ") ]) self.cookie_dict = cookie_dict for url in urls: url_object = parse.urlparse(url) path_list = url_object.path.split("/") for one in path_list: if -1 == one.find(".html"): 
continue city_name = "" city_code_list = one.split("_") city_code = int( city_code_list[0]) if 0 < len(city_code_list) else 0 if 0 < city_code and str( city_code) in self.city_name_dict.keys(): city_name = self.city_name_dict[str(city_code)] if 1 > len(city_name): error_msg = f"{city_code} is NOT in self.city_name_dict.keys() ({self.city_name_dict.keys()})" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) sys.exit(4) break meta_dict["city"] = city_name # cookie_dict = self.change_cookies( cookie_dict ) yield scrapy.Request(url=url, cookies=cookie_dict, callback=self.parse_list_page, meta=meta_dict, dont_filter=True) # yield scrapy.Request( url = url, callback = self.parse_list_page, meta = meta_dict, dont_filter = True ) elif "READ_CSV_AND_REDO" == self.run_purpose: english_city_name = { "佛山": "foshan", "广州": "guangzhou", } filename = "tudi_201808.csv" csv_file_path = os.path.join(self.crawled_dir, filename) url_list = [] city_list = [] try: with open(csv_file_path, newline="", encoding="utf-8") as csvfile: file_reader = csv.reader( csvfile) # , delimiter=' ', quotechar='|' for row in file_reader: if -1 < row[8].find("https:"): url_list.append(row[8]) city_list.append(row[13]) except Exception as ex: error_msg = f"cannot read csv file, Exception = {ex}" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) meta_dict = { "page_type": "detailed", "total_pages": 1, } self.cookie_dict = dict([ pair.split("=", 1) for pair in self.cookie_string.split("; ") ]) if self.use_proxy: proxies_dict = self.proxy_ip_pool() meta_dict["proxy"] = proxies_dict["http"] for index, url in enumerate(url_list): chinese_city_name = city_list[index] meta_dict["city"] = english_city_name[chinese_city_name] yield scrapy.Request(url=url, cookies=self.cookie_dict, callback=self.parse_detailed_page, meta=meta_dict, dont_filter=True) break elif "CHECK_PROXY_IP" == self.run_purpose: now = int(time.time()) token = f"Guangzhou{str(now)}" m = hashlib.md5() m.update(token.encode(encoding='utf-8')) urls = [ f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}", ] if "DRAGONFLY" == self.proxy_agent: proxies_dict = CommonClass.get_proxies(proxy_dict={}) else: proxies_dict = ProxyAgent.get_xunlian_proxy_dict( headers={}, params_for_proxy_ip={}, setup_xunlian_dict={}, need_setup_xunlian=False, logger=self.logger) if 0 < len(proxies_dict): meta_dict = {"proxy": proxies_dict["http"]} for url in urls: yield scrapy.Request(url=url, callback=self.do_nothing_for_debug, meta=meta_dict) else: self.logger.error( f"Error! No proxy ip returns. 
{proxies_dict}") else: urls = [ "http://quotes.toscrape.com/page/1/", "http://quotes.toscrape.com/page/2/", ] for url in urls: yield scrapy.Request(url=url, callback=self.do_nothing_for_debug) def change_cookies(self, cookie_dict={}): if "uservisitMarketitem" in cookie_dict.keys(): item_str = cookie_dict["uservisitMarketitem"] item_str = parse.unquote(item_str) item_list = item_str.split(",") new_str = "" for index, one in enumerate(item_list): if index > len(item_list) - 4: new_str += f",{one}" cookie_dict["uservisitMarketitem"] = parse.quote(new_str) return cookie_dict def get_total_pages(self, response=None): """ /market/440600__4______1_1_3.html """ total_pages = 0 if response is None: return total_pages all_link_list = response.xpath( "//div[@id='divAspNetPager']/a/@href").extract() total_page_list = [] for one in all_link_list: page = 0 temp_list = one.split("_") for one_fragment in temp_list: if -1 < one_fragment.find(".html"): page = one_fragment.replace(".html", "") total_page_list.append(int(page)) break if 1 > len(total_page_list): return 1 return max(total_page_list) def get_this_url_page(self, url_obj_path=""): """ https://land.3fang.com/market/440600__4______1_1_3.html https://land.3fang.com/market/440100__1______1_1_1.html """ url_list = url_obj_path.split("_") for one in url_list: if -1 < one.find(".html"): return int(one.replace(".html", "")) return 0 def make_html_file_name(self, url="", city="", page_type=""): """ https://land.3fang.com/market/440600__3_2__40000_100000000__1_1_1.html https://land.3fang.com/market/cee05e00-3263-4774-a898-9def16955cb4.html """ now = datetime.datetime.now() html_filename = "{}.html".format(now.strftime("%Y%m%d_%H%M%S")) today = now.strftime("%Y%m%d") url_obj = parse.urlparse(url) url_list = url_obj.path.split("/") for one in url_list: if -1 < one.find(".html"): html_filename = f"{city}__{page_type}__{one}" break return html_filename def get_page_and_district_area(self, url_list=[]): """ list page #2 or more, or including channels like: https://sz.esf.fang.com/house-a013080/ where a013080 stands for 深圳市龙华区 or https://sz.esf.fang.com/house-a013080-b014334 or https://sz.esf.fang.com/house-a013080-b02094/i372/ where b014334 stands for 深圳市龙华区大浪;house-a013080-b02094 stands for 观澜; house-a013080-b0350 stands for 龙华;house-a013080-b014333 stands for 民治 https://sz.esf.fang.com/house-a087-b0342/g22/ where g22 stands for 二居室; g21(一居),g23(三居),g24(四居),g25(五居),g299(五居以上) # this option is a multiple choice but this crawl will ONLY use single choice """ page = "1" district_area = "" bedrooms = 0 for index, key in enumerate(url_list): one_fragment = url_list[index] if -1 < one_fragment.find("i3") and -1 == one_fragment.find( "house-"): page = one_fragment[2:] elif -1 < one_fragment.find("house-") and -1 == one_fragment.find( "i3"): district_area = one_fragment.replace("house-", "") district_area = district_area.replace("-", "_") if index + 1 < len(url_list): next_fragment = url_list[index + 1] if -1 < next_fragment.find("g2"): last_part_of_fragment = next_fragment.replace("g2", "") if -1 < last_part_of_fragment.find("-i3"): temp_list = last_part_of_fragment.split("-i3") if 1 < len(temp_list): bedrooms = int(temp_list[0]) else: bedrooms = int(last_part_of_fragment) return (page, district_area, bedrooms) def save_html(self, response=None, save34=False): city = "" if response is None or not hasattr(response, "meta") or not hasattr( response, "body") or not hasattr(response, "url"): if hasattr(response, "url"): error_msg = f"fail to save response.body after 
requesting {response.url}; response has no body or meta attribute(s)" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) return -1, city url = response.url meta_dict = response.meta page_type = "index" total_pages = 0 city = meta_dict["city"] if "city" in meta_dict.keys() else "" if "page_type" in meta_dict.keys(): page_type = meta_dict["page_type"] if "index" == page_type: if "total_pages" in meta_dict.keys(): total_pages = int(meta_dict["total_pages"]) if 0 == total_pages: total_pages = self.get_total_pages(response=response) if 34 < total_pages and not save34: return 101, city html_filename = self.make_html_file_name(url=url, city=city, page_type=page_type) html_file_path = os.path.join(self.saved_html_dir, html_filename) elif "detailed" == page_type: html_filename = self.make_html_file_name(url=url, city=city, page_type=page_type) html_file_path = os.path.join(self.saved_html_dir, html_filename) total_pages = 1001 try: with open(html_file_path, "wb") as f: f.write(response.body) except Exception as ex: error_msg = f"fail to write response.body into {html_file_path} after requesting {url}" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) return -2, city else: if 1 > total_pages: error_msg = f"response.body saved after requesting {response.url}; but fail to extract total page number from response.body" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) return total_pages, city # could be 34 when save34 = True def extract_link_list(self, response=None): link_list = response.xpath( '//dl[@id="landlb_B04_22"]/dd/div[@class="list28_text fl"]/h3/a/@href' ).extract() if 1 > len(link_list): error_msg = f"Fail to extract links from {response.url}" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) return link_list def load_items_into_loader(self, loader=None, text={}, url=""): loader.add_value("content", str(text)) # , encoding="utf-8" loader.add_value("page_type", "detailed") # record housekeeping fields loader.add_value("url", url) loader.add_value("project", self.settings.get('BOT_NAME')) loader.add_value("spider", self.name) loader.add_value("server", socket.gethostname()) loader.add_value("date", datetime.datetime.now().strftime("%Y%m%d_%H%M%S")) return loader def parse_detailed_response_field(self, response=None, city=""): text = {} if response is None: return text if "READ_HTML" == self.run_purpose and not isinstance( response, Selector): return text information_div = response.xpath("//div[@id='printData1']") title = information_div.xpath( "./div[@class='tit_box01']/text()").extract_first(default="") land_id = information_div.xpath( "./div[@class='menubox01 mt20']/span[@class='gray2']/text()" ).extract_first(default="") province_city = information_div.xpath( "string(./div[@class='menubox01 p0515']/div[@class='fl'])" ).extract() province_city = "___".join(province_city) if 0 < len(title): text["title"] = title if 0 < len(land_id): text["land_id"] = land_id if 0 < len(province_city): text["province_city"] = province_city key1 = information_div.xpath( "./div[@class='p1015']/div[@class='tit_box02 border03']/text()" ).extract_first(default="") if "土地基本信息" == key1: basic_info = {} tr_list1 = information_div.xpath( "./div[@class='p1015']/div[@class='tit_box02 border03']/following-sibling::table[@class='tablebox02 
mt10']/tbody/tr" ) for index, one_tr in enumerate(tr_list1): string_list = one_tr.xpath("string(.)").extract() td_list = [] for one_str in string_list: cleaned_str = CommonClass.clean_string(string=one_str, char_to_remove=[ '\xa0', '\n', '\t', ' ', ]) td_list.append(cleaned_str.strip('\r')) basic_info[index] = "___".join(td_list) text[key1] = basic_info key2 = information_div.xpath( "./div[@class='p1015']/div[@class='tit_box02 border03 mt20']/text()" ).extract_first(default="") if "土地交易信息" == key2: trade_info = {} tr_list2 = information_div.xpath( "./div[@class='p1015']/div[@class='tit_box02 border03 mt20']/following-sibling::div[@class='banbox']/table[@class='tablebox02 mt10']/tbody/tr" ) for index, one_tr in enumerate(tr_list2): string_list = one_tr.xpath("string(.)").extract() td_list = [] for one_str in string_list: cleaned_str = CommonClass.clean_string(string=one_str, char_to_remove=[ '\xa0', '\n', '\t', ' ', ]) td_list.append(cleaned_str.strip('\r')) trade_info[index] = "___".join(td_list) text[key2] = trade_info # 20190730 cannot get 土地评估结果, todo ... # evaluation_div = response.xpath("//div[@id='divpg']") # key3 = evaluation_div.xpath("./div[@class='tit_box02 border03 mt20']/text()").extract_first( default= "" ) # if "土地评估结果" == key3: # evaluation_dict = {} # tr_list3 = evaluation_div.xpath("./div[@class='table-03']/table[@class='mt5']/tbody/tr") # for index, one_tr in enumerate( tr_list3 ): # this_td = one_tr.xpath("./td") # if this_td is None: # string_list = one_tr.xpath("string(./th)").extract() # else: # td_list = one_tr.xpath("./td") # string_list = [] # for one_td in td_list: # unit = one_td.xpath("./text()").extract_first( default= "" ) # amount = one_td.xpath("./span/text()").extract_first( default= "" ) # string_list.append( f"{amount}___{unit}" ) # # this_td_str_list = one_td.xpath("string(.)").extract() # # string_list.extend( this_td_str_list ) # td_th_list = [] # for one_str in string_list: # cleaned_str = CommonClass.clean_string( string = one_str, char_to_remove = [ '\xa0', '\n', '\t', ' ',] ) # td_th_list.append( cleaned_str.strip('\r') ) # evaluation_dict[index] = "___".join( td_th_list ) # text[key3] = evaluation_dict # evaluation_div = response.xpath("//div[@id='divpg']") # key3 = evaluation_div.xpath("./div[@class='tit_box02 border03 mt20']/text()").extract_first( default= "" ) # if "土地评估结果" == key3: # evaluation_dict = {} # th_list3 = evaluation_div.xpath("./div[@class='table-03']/table[@class='mt5']/tbody/tr/th") # string_list = th_list3.xpath("string(.)").extract() # evaluation_dict["fields"] = "___".join( string_list ) # tr_list3 = evaluation_div.xpath("./div[@class='table-03']/table[@class='mt5']/tbody/tr") # row2 = tr_list3[1].xpath("./td") # row2string = "" # str1 = row2[0].xpath("./text()").extract_first( default= "" ) # str2 = row2[1].xpath("string(.)").extract() # str2 = "___".join( str2 ) # str3amount = response.xpath("//span[@id='scbj_bpgj']") # str3unit = row2[2].xpath("./text()").extract_first( default= "" ) # str4amount = response.xpath("//span[@id='scbj_bSumPrice']") # str4amount = str4amount.get() # str3amount = str3amount.get() # str4unit = row2[3].xpath("./text()").extract_first( default= "" ) # str5 = row2[4].xpath("./a/@href").extract_first( default= "" ) # evaluation_dict[str1] = f"{str2}___{str3amount} {str3unit}___{str4amount} {str4unit}___{str5}" # row3 = tr_list3[2].xpath("./td") # row3str = row3.xpath("string(.)").extract() # evaluation_dict["假设开发法"] = "___".join( row3str ) # text[key3] = evaluation_dict if 0 < len(text): text["city"] 
= city return text # {'fields': '\xa0___推出楼面价___评估楼面价___评估总价___操作', '市场比较法': '暂无 元/㎡___ 元/㎡___ 万元___ # /LandAssessment/b17ea17a-eefa-428b-8b53-461c2bdc67ea.html', '假设开发法': '假设开发法___暂无 元/㎡___元/㎡___万元___[进入评估报告]'} def log_for_picking_up_the_crawl_break_point(self, page_type="detailed", response=None): if "detailed" == page_type: resume_break_point_file_path = os.path.join( self.log_dir, self.resume_break_point_detailed_file_name) else: resume_break_point_file_path = os.path.join( self.log_dir, self.resume_break_point_list_file_name) try: with open(resume_break_point_file_path, "a") as f: f.write(f"{response.url}\n") except Exception as ex: error_msg = f"fail to write response.url into {resume_break_point_file_path}" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) def parse_detailed_page(self, response=None): url = response.url result_obj = parse.urlparse(url) has_url_error = self.url_contains_error( result_obj_path=result_obj.path) if has_url_error: return False page_status, city = self.save_html(response=response, save34=True) text = self.parse_detailed_response_field(response=response, city=city) if isinstance(text, dict) and 0 < len(text): try: loader = ItemLoader(item=Land3fangItem(), response=response) loader = self.load_items_into_loader(loader=loader, text=text, url=url) self.log_for_picking_up_the_crawl_break_point( page_type="detailed", response=response) yield loader.load_item() except Exception as ex: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, fail to load item. Exception = {ex}" ) def do_nothing_for_debug(self, response=None): self.logger.info( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {response.url}" ) # print( response.body ) # Inside Method request_proxy_ip of Class ProxyAgent, proxy server returns [{'IP': '49.87.226.131:10749'}] # b'{"REMOTE_ADDR":"49.87.226.131","HTTP_CLIENT_IP":"","HTTP_X_FORWARDED_FOR":"49.87.226.131, 49.87.226.131"}' # 2019-06-20 16:28:55 [fangesf] INFO: Inside Method do_nothing_for_debug of Class FangesfSpider, # url = https://www.coursehelper.site/index/index/getHeaders?token=ad89558c89c3394167adbfd1484c8700 # 2019-06-20 16:28:55 [stdout] INFO: b'{"REMOTE_ADDR":"139.196.200.61","HTTP_CLIENT_IP":"","HTTP_X_FORWARDED_FOR":"139.196.200.61, 139.196.200.61"}' def url_contains_error(self, result_obj_path=""): if 1 > len(result_obj_path): return False path_fragment_list = result_obj_path.split("/") if 1 > len(path_fragment_list): return False pass # do know any anticrawl methods yet return False def parse_list_page(self, response=None): """ https://land.3fang.com/market/440600__4______1_1_1.html """ result_obj = parse.urlparse(response.url) has_url_error = self.url_contains_error( result_obj_path=result_obj.path) if has_url_error: return False page_status, city = self.save_html(response=response, save34=False) if 1 > page_status: pass # -2, -1, 0: error_msg has been logged; just pass elif 0 < page_status and 35 > page_status: # 1 to 34 also means "index" == page_type link_list = self.extract_link_list(response=response) if self.debug: self.logger.info( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {response.url}; link_list = {link_list}" ) else: self.log_for_picking_up_the_crawl_break_point( page_type="index", response=response) new_url = f"{result_obj.scheme}://{result_obj.netloc}" this_cookie = self.cookie_jar.extract_cookies( response, 
response.request) print(this_cookie) # crawling vertically meta_dict = { "page_type": "detailed", "total_pages": 1, "city": city, } if self.use_proxy: proxies_dict = self.proxy_ip_pool() meta_dict["proxy"] = proxies_dict["http"] for one_link in link_list: if 0 != one_link.find('/'): one_link = f"/{one_link}" this_i_url = f"{new_url}{one_link}" if this_i_url in self.crawled_detailed_url_list: self.logger.info(f"previously crawled {this_i_url}") else: self.logger.info(f"requesting {this_i_url}") yield scrapy.Request(url=this_i_url, cookies=self.cookie_dict, callback=self.parse_detailed_page, meta=meta_dict, dont_filter=True) # crawling horizontally if 1 < page_status and 1 == self.get_this_url_page( url_obj_path=result_obj.path): meta_dict = response.meta meta_dict["total_pages"] = page_status if self.use_proxy: proxies_dict = self.proxy_ip_pool() meta_dict["proxy"] = proxies_dict["http"] for i in range(page_status - 1): new_path = result_obj.path new_path = new_path.replace("1.html", f"{i + 2}.html") this_i_url = f"{new_url}{new_path}" self.logger.info( f"requesting list page at {this_i_url}") yield scrapy.Request(url=f"{this_i_url}", cookies=self.cookie_dict, callback=self.parse_list_page, meta=meta_dict, dont_filter=True) elif 101 == page_status: error_msg = f"101: todo ... " self.logger.info( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) elif 1001 == page_status: self.parse_detailed_page(response=response) # 1001 also means "detailed" == page_type # will never reach here because self.parse_detailed_page() is the callback method def read_and_parse(self, response=None): file_list = os.listdir(self.saved_html_dir) for one_file in file_list: if -1 == one_file.find("index"): temp_list = one_file.split("___") apt_id = 0 city = "" if 1 < len(temp_list): apt_id = temp_list[1] city = temp_list[0] url = f"https://{city}.esf.fang.com/chushou/3_{apt_id}.htm" # can also be 16_, 10_, and others # https://sz.esf.fang.com/chushou/3_218307566.htm html_file_path = os.path.join(self.saved_html_dir, one_file) if os.path.isfile(html_file_path): doc = None with open(html_file_path, 'rb') as f: # doc = f.read().decode('gb2312', 'ignore') doc = f.read().decode('utf-8', 'ignore') if doc is None: self.logger.error( f"Error: cannot read html file {html_file_path}.") continue response = Selector(text=doc, type="html") text = self.parse_detailed_response_field( response=response, city=city, apt_id=apt_id) try: response_for_items = TextResponse( url=url, status=200, body=bytes(doc, encoding="utf-8")) loader = ItemLoader(item=FangesfItem(), response=response_for_items) loader = self.load_items_into_loader(loader=loader, text=text, url=url) yield loader.load_item() except Exception as ex: self.logger.info( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, Exception = {ex}" ) if self.debug: break def write_log(self, content=None, logfilename=None, content_only=False): if content is not None and 0 < len(content): today = datetime.datetime.now().strftime("%Y%m%d") if logfilename is None: logfilename = f"{self.name}{today}.log" try: with open(os.path.join(self.log_dir, logfilename), 'a', encoding='utf-8') as f: if content_only: info = f"{str(content)}\n" else: info = f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}] {content}\n" f.write(info) return 1 except Exception as ex: return 0 return -1
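# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the spider above): Land3fangSpider builds
# self.cookie_dict with dict(pair.split("=", 1) for pair in COOKIE_STRING.split("; "))
# and change_cookies keeps only the last three entries of the url-encoded
# uservisitMarketitem cookie. The standalone helpers below condense those two
# steps; the function names and the sample cookie string are assumptions, while
# the parsing/trimming logic mirrors the methods above.
# ---------------------------------------------------------------------------
from urllib import parse as _parse


def cookie_string_to_dict(cookie_string):
    """Parse a 'k1=v1; k2=v2' style Cookie header value into a dict."""
    return dict(pair.split("=", 1) for pair in cookie_string.split("; ") if "=" in pair)


def keep_last_visited_items(cookie_dict, keep=3):
    """Trim the url-encoded, comma-separated uservisitMarketitem cookie to its
    last `keep` entries, mirroring change_cookies above."""
    if "uservisitMarketitem" not in cookie_dict:
        return cookie_dict
    items = _parse.unquote(cookie_dict["uservisitMarketitem"]).split(",")
    cookie_dict["uservisitMarketitem"] = _parse.quote(",".join(items[-keep:]))
    return cookie_dict

# Example (made-up cookie values):
# d = cookie_string_to_dict("global_cookie=abc; uservisitMarketitem=1%2C2%2C3%2C4")
# print(keep_last_visited_items(d))  # uservisitMarketitem -> '2%2C3%2C4'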
class DirectionamapcoverageSpider(scrapy.Spider): """ sys.exit code == 2 # missing CITY_LIST or missing input file(s) sys.exit code == 3 # classification file format error sys.exit code == 4 # already requested all xy points in city_list today """ name = "directionamapcoverage" root_path = "" log_dir = "" # debug = False # save_every_response = False crawled_dir = "" json_dir = "" output_folder_name = "" # output_file_format = "json" # base_uri = "" # run_purpose = None custom_settings = CommonClass.get_custom_settings_dict(spider=name) # crontab will start a new process in every 2 hours; therefore in 1 day, the crontab will start 12 times maximal_requests_of_one_crontab_process = 23 interval_between_requests = 300 request_counter = 0 last_4_requests = {} urls = [ "https://restapi.amap.com/v3/direction/driving?origin=113.268029,22.923338&destination=113.3025,23.38575&extensions=all&output=json&key=4ebb849f151dddb3e9aab7abe6e344e2", "https://restapi.amap.com/v3/direction/driving?origin=113.30508,23.38597&destination=113.268029,22.923338&extensions=all&output=json&key=470fdf698e3aab758d4cb026244f5194", "https://restapi.amap.com/v3/direction/driving?origin=113.268029,22.923338&destination=113.81424,22.62471&extensions=all&output=json&key=740f50c6fabd5801d0fad1cba62446d9", "https://restapi.amap.com/v3/direction/driving?origin=113.81276,22.62405&destination=113.268029,22.923338&extensions=all&output=json&key=4328d392605802de34406045b9701bb8", ] # 0 == 碧桂园总部到白云机场;1 == 白云机场到碧桂园总部;2 == 总部到宝安机场;3 == 宝安机场到总部 # https://restapi.amap.com/v3/direction/driving?origin=113.267982,22.92451&destination=113.307605,23.389929&extensions=all&output=json&key=4ebb849f151dddb3e9aab7abe6e344e2 # https://restapi.amap.com/v3/direction/driving?origin=113.307605,23.389929&destination=113.267982,22.92451&extensions=all&output=json&key=4ebb849f151dddb3e9aab7abe6e344e2 # 碧桂园总部 # 113.268029,22.923338 # 白云机场 # 113.3025,23.38575: T1航站楼国内出发 # 113.30508,23.38597: T1航站楼国内到达 # 宝安机场 # 113.81424,22.62471: T3航站楼国内出发 # 113.81276,22.62405: T3航站楼国内到达 # https://restapi.amap.com/v3/direction/driving?origin=113.267982,22.92451&destination=113.814829,22.633092&extensions=all&output=json&key=4ebb849f151dddb3e9aab7abe6e344e2 # https://restapi.amap.com/v3/direction/driving?origin=113.814829,22.633092&destination=113.267982,22.92451&extensions=all&output=json&key=4ebb849f151dddb3e9aab7abe6e344e2 # https://ditu.amap.com/dir?from%5Badcode%5D=440306&from%5Bname%5D=%E6%B7%B1%E5%9C%B3%E5%AE%9D%E5%AE%89%E5%9B%BD%E9%99%85%E6%9C%BA%E5%9C%BA&from%5Bid%5D=B02F37T239&from%5Bpoitype%5D=150104&from%5Blnglat%5D=113.81482900000003%2C22.633092&from%5Bmodxy%5D=113.815186%2C22.624847&to%5Bname%5D=%E7%A2%A7%E6%A1%82%E5%9B%AD%E6%80%BB%E9%83%A8&to%5Blnglat%5D=113.267982%2C22.92451&to%5Bid%5D=B0FFFVAF72&to%5Bpoitype%5D=120201&to%5Badcode%5D=440600&to%5Bmodxy%5D=113.269254%2C22.923768&type=car&policy=1 # https://www.amap.com/dir?from%5Bname%5D=%E7%A2%A7%E6%A1%82%E5%9B%AD%E6%80%BB%E9%83%A8&from%5Blnglat%5D=113.267982%2C22.92451&from%5Bid%5D=B0FFFVAF72-from&from%5Bpoitype%5D=120201&from%5Badcode%5D=440600&from%5Bmodxy%5D=113.269254%2C22.923768&to%5Bid%5D=B0FFG40CGO&to%5Bname%5D=%E5%B9%BF%E5%B7%9E%E7%99%BD%E4%BA%91%E5%9B%BD%E9%99%85%E6%9C%BA%E5%9C%BAT1%E8%88%AA%E7%AB%99%E6%A5%BC(F3%E5%9B%BD%E5%86%85%E5%87%BA%E5%8F%915%E5%8F%B7%E9%97%A8)&to%5Blnglat%5D=113.302846%2C23.385712&to%5Bmodxy%5D=113.302846%2C23.385712&to%5Bpoitype%5D=150105&to%5Badcode%5D=440114&type=car&policy=1 # 
https://www.amap.com/dir?from%5Bid%5D=B00140NZIQ&from%5Bname%5D=%E5%B9%BF%E5%B7%9E%E7%99%BD%E4%BA%91%E5%9B%BD%E9%99%85%E6%9C%BA%E5%9C%BA&from%5Blnglat%5D=113.307605%2C23.389929&from%5Bmodxy%5D=113.303722%2C23.385187&from%5Bpoitype%5D=150104&from%5Badcode%5D=440111&to%5Bname%5D=%E7%A2%A7%E6%A1%82%E5%9B%AD%E6%80%BB%E9%83%A8&to%5Blnglat%5D=113.267982%2C22.92451&to%5Bid%5D=B0FFFVAF72&to%5Bpoitype%5D=120201&to%5Badcode%5D=440600&to%5Bmodxy%5D=113.269254%2C22.923768&type=car&policy=1 def init_self_attributes(self): self.root_path = self.settings.get("PROJECT_PATH") self.log_dir = self.settings.get(name="LOG_DIR", default="") # self.debug = self.settings.get( name = "PROJECT_DEBUG", default=False ) # self.save_every_response = self.settings.get( name = "SAVE_EVERY_RESPONSE", default=False ) self.crawled_dir = self.settings.get(name="CRAWLED_DIR", default="") self.json_dir = self.settings.get(name="SAVED_JSON", default="") self.output_folder_name = self.settings.get(name="OUTPUT_FOLDER_NAME", default="") # self.output_file_format = self.settings.get( name = "OUTPUT_FILE_FORMAT", default="json" ) # self.base_uri = self.settings.get( name = "BASE_URI", default="" ) self.run_purpose = self.settings.get(name="RUN_PURPOSE", default=None) self.overwrite_today = self.settings.get(name="OVERWRITE_TODAY", default="") self.maximal_requests_of_one_crontab_process = self.settings.get( name="MAXIMAL_REQUESTS_OF_ONE_CRONTAB_PROCESS", default=23) self.interval_between_requests = self.settings.get( name="INTERVAL_BETWEEN_REQUESTS", default=300) def check_dirs_and_files(self): if not os.path.isdir(self.crawled_dir): os.makedirs(self.crawled_dir) if not os.path.isdir(self.json_dir): os.makedirs(self.json_dir) def start_requests(self): self.init_self_attributes() self.check_dirs_and_files() if "READ_JSON_AND_WRITE_CSV" == self.run_purpose: one_url = "https://blog.csdn.net/qq_37193537/article/details/78987949" callback_func = self.read_json_and_parse yield scrapy.Request(url=one_url, callback=callback_func, dont_filter=True) else: timestamp_float = time.time() self.last_4_requests = { "request_time": timestamp_float, "requested_index": [ 0, 1, 2, 3, ] } callback_func = self.parse_json for index, one_url in enumerate(self.urls): meta_dict = { "preset_route": index, # 0 == 碧桂园总部到白云机场;1 == 白云机场到碧桂园总部;2 == 总部到宝安机场;3 == 宝安机场到总部 "redo": 0, } self.logger.info(f"{index}: requesting {one_url} ") yield scrapy.Request(url=one_url, callback=callback_func, meta=meta_dict, dont_filter=True) def get_url_according_to_preset_route(self, preset_route=101): if preset_route in [ 0, 1, 2, 3, ]: return self.urls[preset_route] return "" def read_json_and_parse(self, response): file_list = os.listdir(self.json_dir) # route0___1___20190615_234522.json for one_file in file_list: temp_list = one_file.split("___") preset_route = 0 now = "" if 2 < len(temp_list): preset_route = temp_list[0] preset_route = preset_route.lstrip("route") preset_route = CommonClass.find_digits_from_str( string=preset_route, return_all=False) preset_route = int(preset_route) now = temp_list[2] now = now.rstrip(".json") url = self.get_url_according_to_preset_route( preset_route=preset_route) json_file_path = os.path.join(self.json_dir, one_file) if os.path.isfile(json_file_path): try: doc = None with open(json_file_path, "rb") as f: doc = f.read().decode("utf-8", "ignore") if doc is None: self.logger.error( f"Error: cannot read html file {json_file_path}." 
) continue text_dict = self.extract_text_dict_from_response_body( body=doc, preset_route=preset_route, now=now) if 0 < len(text_dict): json_selector = Selector(text=doc, type=None) loader = ItemLoader(item=DirectionamapItem(), selector=json_selector) loader = self.load_items_into_loader( loader=loader, text=text_dict, url=url, now=now) yield loader.load_item() except Exception as ex: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, error happened during loading ItemLoader. Exception = {ex}" ) def extract_text_dict_from_response_body(self, body="", preset_route=101, now=""): text_dict = {} json_dict = json.loads(body) count = int(json_dict["count"]) # already 0 < count route_dict = json_dict["route"] if "route" in json_dict.keys() else {} paths = route_dict["paths"] if "paths" in route_dict.keys() else [] duration = 0 strategy = "速度最快" selected_path_steps = [] found_fastest = False if 1 < len(paths): for one_path in paths: temp_strategy = one_path[ "strategy"] if "strategy" in one_path.keys() else "" if -1 < temp_strategy.find("速度最快"): duration = int(one_path["duration"] ) if "duration" in one_path.keys() else 0 strategy = temp_strategy selected_path_steps = one_path[ "steps"] if "steps" in one_path.keys() else [] found_fastest = True break if 1 == len(paths) or (not found_fastest and 1 < len(paths)): duration = int( paths[0]["duration"]) if "duration" in paths[0].keys() else 0 strategy = paths[0]["strategy"] if "strategy" in paths[0].keys( ) else "" selected_path_steps = paths[0]["steps"] if "steps" in paths[ 0].keys() else [] text_dict = { "preset_route": preset_route, "strategy": strategy, "duration": duration, "count": count, "paths": len(paths), "now": now, "selected_path_steps": selected_path_steps, } return text_dict def parse_json(self, response): status, message = self.save_json(response=response, page_type="json") now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") preset_route = -1 if 1 == status: try: meta_dict = response.meta preset_route = int(meta_dict["preset_route"]) text_dict = self.extract_text_dict_from_response_body( body=response.body, preset_route=preset_route, now=now) loader = ItemLoader(item=DirectionamapItem(), response=response) loader = self.load_items_into_loader(loader=loader, text=text_dict, url=response.url, now=now) yield loader.load_item() except Exception as ex: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, error happened during loading ItemLoader. 
Exception = {ex}" ) if -1 == preset_route: if hasattr(response, "meta"): meta_dict = response.meta if "preset_route" in meta_dict.keys(): preset_route = int(meta_dict["preset_route"]) if -1 < preset_route: received_all_4_requests_bool = self.check_this_preset_route( preset_route=preset_route) if not received_all_4_requests_bool and "redo" in response.meta.keys( ): delayed_index_list = self.get_delayed_response_more_than_1_minute( ) if 0 < len(delayed_index_list): request_result_bool = self.redo_requests( redo=response.meta["redo"]) # get data again after 5 minutes if self.request_counter < self.maximal_requests_of_one_crontab_process and received_all_4_requests_bool: while (self.check_time_interval()): time.sleep(10) self.request_counter += 1 now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") self.logger.info( f" requesting amap at {now} ( {self.request_counter} of { self.maximal_requests_of_one_crontab_process } )" ) self.last_4_requests = { "request_time": time.time(), "requested_index": [ 0, 1, 2, 3, ] } callback_func = self.parse_json for index, one_url in enumerate(self.urls): meta_dict = { "preset_route": index, "redo": 0, } self.logger.info(f"{index}: requesting {one_url} ") yield scrapy.Request(url=one_url, callback=callback_func, meta=meta_dict, dont_filter=True) def check_time_interval(self): if "request_time" not in self.last_4_requests.keys() or not isinstance( self.last_4_requests["request_time"], float): return False if time.time() - self.last_4_requests["request_time"] > float( self.interval_between_requests): return False return True def redo_requests(self, redo=-1): urls = [] index_list = [] if 1 > len(self.last_4_requests["requested_index"]) or 0 > redo: return False for one_index in self.last_4_requests["requested_index"]: urls.append(self.urls[one_index]) index_list.append(one_index) now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") for index, one_url in enumerate(urls): meta_dict = { "preset_route": index_list[index], "redo": redo + 1, } self.logger.info( f"[{now}] redo {index_list[index]}: requesting {one_url} ") yield scrapy.Request(url=one_url, callback=self.parse_json, meta=meta_dict, dont_filter=True) def get_delayed_response_more_than_1_minute(self): if "requested_index" not in self.last_4_requests.keys( ) or not isinstance(self.last_4_requests["requested_index"], list): return [] if "request_time" not in self.last_4_requests.keys() or not isinstance( self.last_4_requests["request_time"], float): return [] if time.time() - self.last_4_requests["request_time"] > 60.0: return self.last_4_requests["requested_index"] return [] def check_this_preset_route(self, preset_route=-1): if preset_route not in [ 0, 1, 2, 3, ]: return True if "request_time" not in self.last_4_requests.keys() or not isinstance( self.last_4_requests["request_time"], float): return True if "requested_index" not in self.last_4_requests.keys( ) or not isinstance(self.last_4_requests["requested_index"], list): return True # 4 minutes have passed, just return True if time.time() - self.last_4_requests["request_time"] > 240.0: return True # remove current preset_route if preset_route in self.last_4_requests["requested_index"]: self.last_4_requests["requested_index"].remove(preset_route) if 1 > len(self.last_4_requests["requested_index"]): return True # There are(is a) element(s) in self.last_4_requests["requested_index"] return False def load_items_into_loader(self, loader=None, text={}, url="", now=""): loader.add_value("url", url) loader.add_value("project", self.settings.get("BOT_NAME", 
default="")) loader.add_value("spider", self.name) loader.add_value("server", socket.gethostname()) loader.add_value("date", now) loader.add_value("content", str(text)) loader.add_value("page_type", "json") return loader def save_json(self, response=None, page_type="json"): status = -4 if response is None or not hasattr(response, "body") or not hasattr( response, "url") or not hasattr(response, "meta"): self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, bad response object" ) return (-1, f"wrong response object") meta_dict = response.meta preset_route = meta_dict[ "preset_route"] if "preset_route" in meta_dict.keys() else "" file_path = "" if "json" == page_type: json_dict = json.loads(response.body) status = json_dict["status"] if "status" in json_dict.keys( ) else "404" count = int( json_dict["count"]) if "count" in json_dict.keys() else 0 if 0 < count: now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") file_path = os.path.join( self.json_dir, f"route{preset_route}___{status}___{now}.json") status = int(status) else: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, wrong parameter page_type == {page_type} from {response.url}" ) return (-2, f"page_type can ONLY be json") return_msg = "0 count" if 0 < len(file_path): try: with open(file_path, 'wb') as f: f.write(response.body) except Exception as ex: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, failed to write response.body from {response.url}" ) return (status, f"failed to write json file") return (status, return_msg)
class Shop58Spider(scrapy.Spider): """ sys.exit code == 1 # wrong or missing RUN_PURPOSE sys.exit code == 2 # wrong or missing CRAWLED_DIR, SAVED_DETAIL_HTML, or SAVED_GAODE_JASON sys.exit code == 3 # fail to get proxy's ip On 20190605 Peter writes this spider upon requests """ name = "shop58" root_path = "" log_dir = "" over70_filename = "" resume_break_point_detailed_file_name = "crawled_detailed_html.log" resume_break_point_list_file_name = "crawled_list_html.log" crawled_list_url_list = [] crawled_detailed_url_list = [] debug = False city_list = [] run_purpose = None save_every_response = False overwrite_today = "" crawled_dir = "" saved_html_dir = "" gaode_json_dir = "" csv_file_path = None custom_settings = CommonClass.get_custom_settings_dict(spider=name) proxy_ip_dict = {} min_proxy_ip_life_time = 6 max_proxy_ip_life_time = 180 use_proxy = False shop_area_uri_list = [ "0_20", "20_50", "50_100", "100_200", "200_500", "500_%2A", ] def init_self_attributes(self): self.root_path = self.settings.get("PROJECT_PATH") self.log_dir = self.settings.get(name="LOG_DIR", default="") self.over70_filename = self.settings.get(name="OVER70_LOG_FILENAME", default="") self.debug = self.settings.get(name="PROJECT_DEBUG", default=False) self.city_list = self.settings.get("CITY_LIST", default=[]) if 1 > len(self.city_list): self.logger.error(f"missing CITY_LIST ({self.city_list}) setting") sys.exit(1) self.run_purpose = self.settings.get(name="RUN_PURPOSE", default=None) if self.run_purpose is None: self.logger.error( f"missing RUN_PURPOSE ({self.run_purpose}) setting") sys.exit(2) self.save_every_response = self.settings.get( name="SAVE_EVERY_RESPONSE", default=False) self.overwrite_today = self.settings.get("OVERWRITE_TODAY", default="") if not hasattr(self, "overwrite_today") or 1 > len( self.overwrite_today) or self.overwrite_today is None: self.overwrite_today = datetime.datetime.now().strftime("%Y%m%d") # set all paths self.crawled_dir = self.settings.get(name="CRAWLED_DIR", default="") self.saved_html_dir = self.settings.get(name="SAVED_HTML", default="") self.gaode_json_dir = self.settings.get(name="SAVED_GAODE_JASON", default="") self.csv_file_path = os.path.join( self.crawled_dir, f"shop58_{self.overwrite_today}.csv") if 1 > len(self.crawled_dir) or 1 > len( self.saved_html_dir) or 1 > len(self.gaode_json_dir): error_msg = f"missing CRAWLED_DIR ({self.crawled_dir}), SAVED_HTML ({self.saved_html_dir}), or SAVED_GAODE_JASON ({self.gaode_json_dir}) setting(s)" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) sys.exit(3) self.min_proxy_ip_life_time = self.settings.get( name="MIN_PROXY_LIFE_SPAN", default=6) self.max_proxy_ip_life_time = self.settings.get( name="MAX_PROXY_LIFE_SPAN", default=180) self.use_proxy = self.settings.get(name="HTTPPROXY_ENABLED", default=False) def make_dirs(self): # even cache is used, we save all html files; here we make these 3 dirs if they do not exist if not os.path.isdir(self.crawled_dir): os.makedirs(self.crawled_dir) if not os.path.isdir(self.saved_html_dir): os.makedirs(self.saved_html_dir) if not os.path.isdir(self.gaode_json_dir): os.makedirs(self.gaode_json_dir) def proxy_ip_pool(self): """ 10000 提取过快,请至少5秒提取一次 """ now = time.time() need_new_proxy = False if self.proxy_ip_dict is None or 1 > len(self.proxy_ip_dict): need_new_proxy = True elif "expire" not in self.proxy_ip_dict.keys(): need_new_proxy = True elif now + 3 > self.proxy_ip_dict["expire"]: need_new_proxy = True if need_new_proxy: 
proxies_dict = ProxyAgent.get_xunlian_proxy_dict( headers={}, params_for_proxy_ip={}, setup_xunlian_dict={}, need_setup_xunlian=False, logger=self.logger) if 1 > len(proxies_dict): return self.proxy_ip_dict # still return the old ip dict or {} proxies_dict["expire"] = now + random.randint( self.min_proxy_ip_life_time, self.max_proxy_ip_life_time) # set ip life time self.proxy_ip_dict = proxies_dict return self.proxy_ip_dict def read_crawled_urls(self): """ for resume crawling at a break point """ resume_break_point_detailed_file_path = os.path.join( self.log_dir, self.resume_break_point_detailed_file_name) try: with open(resume_break_point_detailed_file_path, "r", encoding="utf-8") as log_file: self.crawled_detailed_url_list = log_file.readlines() while "" in self.crawled_detailed_url_list: self.crawled_detailed_url_list.remove("") except Exception as ex: error_msg = f"fail to read {resume_break_point_detailed_file_path}" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) def start_requests(self): self.init_self_attributes() self.make_dirs() self.read_crawled_urls() if "READ_HTML" == self.run_purpose: # READ_HTML is one kind of debug url = 'http://quotes.toscrape.com/page/1/' yield scrapy.Request(url=url, callback=self.read_and_parse) elif "PRODUCTION_RUN" == self.run_purpose: city_list = self.settings.get("CITY_LIST", default=[]) number_day_of_this_year = datetime.datetime.now().timetuple( ).tm_yday # type == int seperate_into_days = self.settings.get("CRAWL_BATCHES", default=3) if seperate_into_days > len(city_list): seperate_into_days = len(city_list) batch_count = math.ceil(len(city_list) / seperate_into_days) today_batch = number_day_of_this_year % seperate_into_days start_index = today_batch * batch_count - 1 end_index = (today_batch + 1) * batch_count urls = [] for index, city in enumerate(city_list): if (start_index < index) and (index < end_index): urls.append(f"https://{city}.58.com/shangpu/") meta_dict = { "page_type": "index", "total_pages": 0, "index_level": 0, } if self.use_proxy: proxies_dict = self.proxy_ip_pool() if 1 > len(proxies_dict): sys.exit(3) meta_dict["proxy"] = proxies_dict["http"] for url in urls: yield scrapy.Request(url=url, callback=self.parse_list_page, meta=meta_dict, dont_filter=True) elif "CHECK_PROXY_IP" == self.run_purpose: now = int(time.time()) token = f"Guangzhou{str(now)}" m = hashlib.md5() m.update(token.encode(encoding='utf-8')) urls = [ f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}", ] proxies_dict = ProxyAgent.get_xunlian_proxy_dict( headers={}, params_for_proxy_ip={}, setup_xunlian_dict={}, need_setup_xunlian=False, logger=self.logger) if 0 < len(proxies_dict): meta_dict = {"proxy": proxies_dict["http"]} for url in urls: yield scrapy.Request(url=url, callback=self.do_nothing_for_debug, meta=meta_dict) else: self.logger.error( f"Error! No proxy ip returns. 
{proxies_dict}") elif "SAVE_ONE_HTML" == self.run_purpose: url = "https://gz.58.com/shangpu/" meta_dict = { "page_type": "index", "total_pages": 0, "index_level": 0, } yield scrapy.Request(url=url, callback=self.do_nothing_for_debug, meta=meta_dict, dont_filter=True) else: urls = [ "http://quotes.toscrape.com/page/1/", "http://quotes.toscrape.com/page/2/", ] for url in urls: yield scrapy.Request(url=url, callback=self.do_nothing_for_debug) def remove_url_page_part(self, url="", add_query=True): """ this version ignore url_obj.fragment and url_obj.params """ new_url = url url_obj = parse.urlparse(url) if hasattr(url_obj, "path"): url_list = url_obj.path.split("/") path_changed = False for one in url_list: if 0 == one.find("pn"): url_list.remove(one) path_changed = True if path_changed: new_path = "/".join(url_list) new_url = f"{url_obj.scheme}://{url_obj.netloc}" new_url = new_url.rstrip("/") if 0 < len(new_path): new_path = new_path.lstrip("/") new_url = f"{new_url}/{new_path}" if hasattr(url_object, "query") and 0 < len(url_obj.query) and add_query: new_url = f"{new_url}{ url_obj.query }" return new_url def add_url_page_part(self, old_url="", page=2): """ this version ignore url_obj.fragment and url_obj.params """ url_obj = parse.urlparse(old_url) new_url = f"{url_obj.scheme}://{url_obj.netloc}" if not hasattr(url_obj, "path") or 1 > len(url_obj.path): new_path = f"pn{page}" else: temp_list = [] url_list = url_obj.path.split("/") for one in url_list: if 0 == one.find("pn"): continue if 0 < len(one): temp_list.append(one) temp_list.append(f"pn{page}") new_path = "/".join(temp_list) new_url = f"{url_obj.scheme}://{url_obj.netloc}" new_url = new_url.rstrip("/") new_path = new_path.lstrip("/") new_url = f"{new_url}/{new_path}" new_url = new_url.rstrip("/") if hasattr(url_obj, "query") and 0 < len(url_obj.query): new_url = f"{new_url}/?{ url_obj.query }" return new_url def get_page_from_url(self, url=""): page_num = 0 url_obj = parse.urlparse(url) if hasattr(url_obj, "path"): url_list = url_obj.path.split("/") for one in url_list: if 0 == one.find("pn"): page_num = CommonClass.find_digits_from_str( string=one, return_all=False) return int(page_num) def get_total_pages(self, response=None): total_pages = 0 if response is None: return total_pages page_list = response.xpath( "//div[@class='content-side-left']/div[@class='pager']/a/@href" ).extract() for one in page_list: this_url_page_num = self.get_page_from_url(url=one) if total_pages < this_url_page_num: total_pages = this_url_page_num if 1 > total_pages: error_msg = f"fail to extract last page number ({page_list}) from {response.url} or this url has ONLY one page" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) return total_pages def get_city_from_url(self, url=""): city = "" result_obj = parse.urlparse(url) if -1 < result_obj.netloc.find("58.com"): temp2_list = result_obj.netloc.split(".") if 3 == len(temp2_list): city = temp2_list[0] return city def get_page_area_district_from_url(self, url_object=None): """ https://fs.58.com/shangpucz/ https://gz.58.com/shangpu/ https://gz.58.com/tianhe/shangpucz/pn3/?area=20_50 https://fs.58.com/foshan/shangpucz/pn2/ # foshan == 佛山周边,与禅城、高明、三水等同级 https://gz.58.com/shangpucz/pn3/ https://fs.58.com/shangpu/38143746902823x.shtml """ page = "1" district = "" shop_area = "" detailed_page = False if url_object is not None and hasattr( url_object, "netloc") and -1 < url_object.netloc.find("58.com"): # parse query has_shop_area = True if not 
hasattr(url_object, "query") or 1 > len(url_object.query): has_shop_area = False if has_shop_area: query_dict = parse.parse_qs(url_object.query) if "area" in query_dict.keys() and isinstance( query_dict["area"], list) and 0 < len(query_dict["area"]): shop_area = query_dict["area"][0] # parse path if hasattr(url_object, "path"): url_list = url_object.path.split("/") temp_list = [] for one in url_list: if 0 < len(one) and -1 == one.find( "shangpucz") and -1 == one.find( "shangpu") and -1 == one.find("pn"): temp_list.append(one) elif -1 < one.find("pn"): page = CommonClass.find_digits_from_str( string=one, return_all=False) elif -1 < one.find(".shtml"): detailed_page = True if not detailed_page and 1 == len(temp_list): district = temp_list[0] if detailed_page: page = "0" return (page, district, shop_area) def make_html_file_name(self, url="", city=""): """ https://fs.58.com/shangpucz/ https://gz.58.com/shangpu/ https://gz.58.com/tianhe/shangpucz/pn3/?area=20_50 https://fs.58.com/foshan/shangpucz/pn2/ https://gz.58.com/shangpucz/pn3/ https://fs.58.com/shangpu/38143746902823x.shtml """ now = datetime.datetime.now() html_filename = "{}.html".format(now.strftime("%Y%m%d_%H%M%S")) today = now.strftime("%Y%m%d") result_obj = parse.urlparse(url) url_list = result_obj.path.split("/") while "" in url_list: url_list.remove("") detail_page = False last_part = url_list[len(url_list) - 1] if 0 < len(url_list) else "" if -1 < last_part.find(".shtml"): detail_page = True # https://fs.58.com/shangpu/38143746902823x.shtml shop_id = last_part.rstrip(".shtml") html_filename = f"{city}___{shop_id}___{today}.html" elif -1 < result_obj.netloc.find("58.com") and 1 == len( url_list) and url_list[0] in [ "shangpucz", "shangpu", ]: # list page #1: https://fs.58.com/shangpucz/ html_filename = f"{city}___all___all___index1___{today}.html" else: page, district, shop_area = self.get_page_area_district_from_url( url_object=result_obj) if -1 < shop_area.find("500_"): shop_area = "over500" if 0 < len(district) and 0 < len(shop_area): html_filename = f"{city}___{district}___{shop_area}___index{page}___{today}.html" elif 0 < len(district): html_filename = f"{city}___{district}___all___index{page}___{today}.html" elif 0 < len(shop_area): html_filename = f"{city}___all___{shop_area}___index{page}___{today}.html" else: html_filename = f"{city}___all___all___index{page}___{today}.html" return (detail_page, html_filename) def get_shop_id(self, url=""): shop_id = f"random{random.randint( 10000, 99999 )}" url_obj = parse.urlparse(url) if hasattr(url_obj, "path"): url_list = url_obj.path.split("/") for one in url_list: if -1 < one.find(".shtml"): shop_id = one.rstrip(".shtml") return shop_id def save_html(self, response=None, save70=False): """ returns -1: wrong response object -2: fail to write response.body 1001: this is a detailed page 101: more than 69 pages 0 to 70: page number; 0:detailed page or fail to extract total page from list page """ if response is None or not hasattr(response, "meta") or not hasattr( response, "body") or not hasattr(response, "url"): if hasattr(response, "url"): error_msg = f"fail to save response.body after requesting {response.url}; response has no body or meta attribute(s)" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) return -1 url = response.url meta_dict = response.meta page_type = "index" total_pages = 0 city = self.get_city_from_url(url=url) if "page_type" in meta_dict.keys(): page_type = meta_dict["page_type"] if "index" 
== page_type: if "total_pages" in meta_dict.keys(): total_pages = int(meta_dict["total_pages"]) if 0 == total_pages: total_pages = self.get_total_pages(response=response) if 69 < total_pages and not save70: return 101 detail_page, html_filename = self.make_html_file_name(url=url, city=city) html_file_path = os.path.join(self.saved_html_dir, html_filename) elif "detailed" == page_type: total_pages = 1001 today = datetime.datetime.now().strftime("%Y%m%d") shop_id = self.get_shop_id(url=url) html_filename = f"{city}___{shop_id}___{today}.html" html_file_path = os.path.join(self.saved_html_dir, html_filename) try: with open(html_file_path, "wb") as f: f.write(response.body) except Exception as ex: error_msg = f"fail to write response.body into {html_file_path} after requesting {url}" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) return -2 else: if 1 > total_pages: error_msg = f"response.body saved after requesting {response.url}; but fail to extract total page number from response.body" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) return total_pages # could be 70 when save70 == True def divide_request_into_next_level(self, response=None): """ # returns: (-1, [], -1): wrong response object (-2, [], 2): already using shop area as level 2, currently we only have levels up to 2 (-3, [], -1): this response.url is already page #2 or more (-4, [], -1): this page is a detailed page (-11, [], index_level): fail to extract links from response.body (-12, [], index_level): same as (-2, [], 2) (-13, [], index_level): wrong parameter (index_level) (pointer, district_list, index_level): 0 == pointer; district_list is a []; index_level is int and in [0, 1, 2] 0 == index_level: this url is for whole city like guangzhou, foshan, or shenzhen 1 == index_level: this url is for one district like tianhe, baiyun, panyu, or others in guangzhou 2 == index_level: this url is for one shop_area size listed in self.shop_area_uri_list https://fs.58.com/shangpucz/ https://gz.58.com/shangpu/ https://gz.58.com/tianhe/shangpucz/pn3/?area=20_50 https://fs.58.com/foshan/shangpucz/pn2/ https://gz.58.com/shangpucz/pn3/ """ if response is None or not hasattr(response, "meta") or not hasattr( response, "body") or not hasattr(response, "url"): error_msg = f"meta = {hasattr( response, 'meta' )}; body = {hasattr( response, 'body' )}; url = {hasattr( response, 'url' )}" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) return (-1, [], -1) url = response.url url_obj = parse.urlparse(url) page, district, shop_area = self.get_page_area_district_from_url( url_object=url_obj) if 0 < len(shop_area): page_status = self.save_html(response=response, save70=True) self.write_log(content=f"{response.url}", logfilename=self.over70_filename, content_only=True) return (-2, [], 2) if 1 < int(page): page_status = self.save_html(response=response, save70=True) return (-3, [], -1) elif 0 == int(page): return (-4, [], -1) index_level = self.get_index_level(response=response, district=district) pointer, link_list = self.extract_this_level_options( response=response, index_level=index_level, district=district) if pointer in [ -11, -12, ]: page_status = self.save_html(response=response, save70=True) self.write_log(content=f"{response.url}", logfilename=self.over70_filename, content_only=True) return (pointer, link_list, index_level) def 
get_index_level(self, response=None, district=""): meta_dict = response.meta index_level = 0 if "index_level" in meta_dict.keys(): index_level = int(meta_dict["index_level"]) url_obj = parse.urlparse(response.url) query_dict = url_obj.query if hasattr(url_obj, "query") else {} if 0 < len(query_dict): query_dict = parse.parse_qs(query_dict) # district and area have higher priority than index_level if 0 < len(query_dict) and "area" in query_dict.keys(): if 2 != index_level: index_level = 2 elif 0 < len(district) and 1 != index_level: error_msg = f"index_level {index_level} != ({district})" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) index_level = 1 elif 0 == len(district) and 0 != index_level: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {index_level} is not 0" ) index_level = 0 return index_level def make_new_url(self, parent_level_url="", index_level=0, fragment=""): """ make one child url according to parent url https://fs.58.com/shangpucz/ https://gz.58.com/shangpu/ https://gz.58.com/tianhe/shangpucz/pn3/?area=20_50 https://fs.58.com/foshan/shangpucz/pn2/ # foshan == 佛山周边,与禅城、高明、三水等同级 https://gz.58.com/shangpucz/pn3/ """ parent_url_obj = parse.urlparse(parent_level_url) child_url = f"{parent_url_obj.scheme}://{parent_url_obj.netloc}" child_url = child_url.rstrip("/") if 1 == index_level: # it is parent's index_level return f"{child_url}/{parent_url_obj.path.lstrip('/')}?area={fragment}" elif 0 == index_level: return f"{child_url}/{fragment.strip('/')}/shangpucz/" else: return "" def extract_district_from_url_paths(self, district_list=[]): """ /shangpucz/ /tianhe/shangpucz/ /haizhu/shangpucz/ """ return_list = [] for one_link in district_list: url_list = one_link.split("/") good_url_list = [] for good_url in url_list: if 0 < len(good_url) and -1 == good_url.find( "shangpucz") and -1 == good_url.find("shangpu"): # and -1 == good_url.find("pn") and -1 == one.find(".shtml"): good_url_list.append(good_url) if 1 == len(good_url_list): return_list.append(good_url_list[0]) return return_list def extract_this_level_options(self, response=None, index_level=0, district=""): """ # returns: ( 0, [a list has one element or more] ) ( -11, [] ): fail to extract links from response.body ( -12, [] ): already 2 == index_level ( -13, [] ): wrong parameter ( index_level ) """ district_list = [] if 0 == index_level: district_dl = response.xpath( '//div[@class="filter-wrap"]/dl[@class="secitem"]') for one_district in district_dl: dl_dtitle = one_district.xpath("./dt/text()").extract_first( default="") if 0 == dl_dtitle.find("区域:"): break district_list = one_district.xpath("./dd/a/@href").extract() if 0 < len(district_list): district_list = self.extract_district_from_url_paths( district_list=district_list) if 0 < len(district_list): return (0, district_list) error_msg = f"fail to extract links from response.body after requesting {response.url}" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) return (-11, []) elif 1 == index_level: return (0, self.shop_area_uri_list) elif 2 == index_level: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, 2 == index_level; we will NOT divide further" ) return (-12, []) else: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, wrong index_level ({index_level})" ) return 
(-13, []) def load_items_into_loader(self, loader=None, text={}, url=""): loader.add_value("content", str(text)) # , encoding="utf-8" loader.add_value("page_type", "detailed") # record housekeeping fields loader.add_value("url", url) loader.add_value("project", self.settings.get('BOT_NAME')) loader.add_value("spider", self.name) loader.add_value("server", socket.gethostname()) loader.add_value("date", datetime.datetime.now().strftime("%Y%m%d_%H%M%S")) return loader def extract_shop_id_from_href(self, shop_id="", use_logr=False): """ # href: https://jxjump.58.com/service?target=FCADV8oV3os7xtAj_6pMK7rUlr7DdRMx8H_54olt8EXOWkK_Zpk1zEffDjhGKDukKaSGKEtf3gzeNV-\ jc68R330iX4JeiOAQ9mxZIx_7k2_EqtKoFph2NZi5EUpVl9S607kui9wZ5vFL9FjgOWSrSlIBohzi3WsLQSp_Rr-QuiAazy31jEubeh76kg5T_\ uVyVN1UCEVsUjMvAnmEU0sZOrGXZEsuraI5DWpE1qXSASL8rH4cOWSrSlIBoh7ifevBw4N33&pubid=75911118&apptype=0&psid=161954686204511925276830641&\ entinfo=38284513452802_0&cookie=%7C%7C%7C&fzbref=0&key¶ms=busitime^desc # logr: z_2_33120284640267_38420286183181_1_2_sortid:613502485@postdate:1560301497000 gz_2_55687810204183_36683482092955_sortid:599933703@postdate:1560268805000@ses:busitime^desc@pubid:76309565 """ shop_id_str = "" if use_logr: seen_sortid = False if isinstance(shop_id, str): temp_list = shop_id.split("_") if 0 < len(temp_list): temp_list.reverse() for one in temp_list: if 0 == one.find("sortid"): seen_sortid = True if seen_sortid and 14 == len(one): return one else: url_obj = None if isinstance(shop_id, str): url_obj = parse.urlparse(shop_id) if hasattr(url_obj, "query"): query_dict = parse.parse_qs(url_obj.query) if isinstance(query_dict, dict) and "entinfo" in query_dict.keys(): shop_id_str = query_dict["entinfo"] if isinstance(shop_id_str, list) and 0 < len(shop_id_str): shop_id_str = shop_id_str[0] if isinstance(shop_id_str, str) and -1 < shop_id_str.find("_"): temp_list = shop_id_str.split("_") shop_id_str = temp_list[0] return shop_id_str def parse_list_response_field(self, response=None, city=""): text_list = [] if response is None: return text if "READ_HTML" == self.run_purpose and not isinstance( response, Selector): return text shops = response.xpath( '//div[@class="content-wrap"]/div[@class="content-side-left"]/ul[@class="house-list-wrap"]/li[@logr]' ) for one_shop in shops: try: shop_id = one_shop.xpath( "./div[@class='list-info']/h2[@class='title']/a/@href" ).extract_first(default='') shop_id = self.extract_shop_id_from_href(shop_id=shop_id, use_logr=False) if 1 > len(shop_id): shop_id = one_shop.xpath("./@logr").extract_first( default='') shop_id = self.extract_shop_id_from_href(shop_id=shop_id, use_logr=True) title = one_shop.css( 'div.list-info h2.title a span.title_des::text' ).extract_first(default='') baseinfo_list = one_shop.css('div.list-info p.baseinfo') description = "" baseinfo_items = [] address = "" for index, onelist in enumerate(baseinfo_list): temp = onelist.css("span::text").extract() if 0 < len(temp): baseinfo_items += temp if index + 1 == len(baseinfo_list): address = temp[len(temp) - 1] if 0 < len(baseinfo_items): description = "___descr___".join(baseinfo_items) tags = "" tag_list = one_shop.xpath( "./div[@class='list-info']/p[@class='tag-wrap']/span/text()" ).extract() if 0 < len(tag_list): tags = "___tags___".join(tag_list) price_box = one_shop.css('div.price') price_sum = price_box.css('p.sum b::text').extract_first( default='') price_sum_unit = price_box.css( 'p.sum span::text').extract_first(default='') unitprice = price_box.css('p.unit span::text').extract_first( default='') 
unitprice_unit_list = price_box.css('p.unit::text').extract() unitprice_unit = unitprice_unit_list[ len(unitprice_unit_list) - 1].strip() if 0 < len(unitprice_unit_list) else "" text = { "shop_id": shop_id, "city": city, "title": title.strip(), "description": description.strip(), "address": address, "tags": tags, "price_sum": price_sum.strip(), "price_sum_unit": price_sum_unit.strip(), "unitprice": unitprice.strip(), "unitprice_unit": unitprice_unit.strip(), } text_list.append(text) except Exception as ex: error_msg = f"Error happened during parsing. Exception = {ex}; one_shop = {one_shop}" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) continue return text_list def parse_detailed_page(self, response=None): self.logger.info( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, todo..." ) pass def do_nothing_for_debug(self, response=None): self.logger.info( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {response.url}" ) print(response.body) # Inside Method request_proxy_ip of Class ProxyAgent, proxy server returns [{'IP': '49.87.226.131:10749'}] # b'{"REMOTE_ADDR":"49.87.226.131","HTTP_CLIENT_IP":"","HTTP_X_FORWARDED_FOR":"49.87.226.131, 49.87.226.131"}' def url_contains_error(self, url_obj=""): """ we do not know any anticrawl method by 58.com yet """ if hasattr(url_obj, "path"): pass return False def make_next_pages_url_from_page_one(self, url="", index_level_int=0, page_number_int=0): urls = [] if 2 > page_number_int or 0 > index_level_int or 2 < index_level_int: return urls page = self.get_page_from_url(url=url) if 1 < page: return urls # we ONLY do this at Page 1 elif 1 == page: # url contains /pnxxx/ part; then remove it new_url = self.remove_url_page_part(url=url) else: new_url = url.rstrip("/") if 2 > index_level_int: # 0 == index_level_int: https://gz.58.com/shangpucz/pn3/ # 1 == index_level_int: https://fs.58.com/foshan/shangpucz/pn2/ for i in range(page_number_int - 1): urls.append(self.add_url_page_part(old_url=url, page=(i + 2))) else: # https://gz.58.com/tianhe/shangpucz/pn3/?area=20_50 for i in range(page_number_int - 1): urls.append(self.add_url_page_part(old_url=url, page=(i + 2))) return urls def parse_list_page(self, response=None): page_status = self.save_html(response=response, save70=False) url_obj = parse.urlparse(response.url) no_url_error = self.url_contains_error(url_obj=url_obj) load_this_page_items = False if 1 > page_status: pass # -2, -1, 0: error_msg has been logged; just pass elif 0 < page_status and 101 > page_status and not no_url_error: # 1 to 70 also means "index" == page_type load_this_page_items = True if 1 < page_status: # ONLY reponsed html having total page more than 1 will go further page, district, shop_area = self.get_page_area_district_from_url( url_object=url_obj) if 1 == int(page): # ONLY do this on Page #1 index_level = self.get_index_level(response=response, district=district) urls = self.make_next_pages_url_from_page_one( url=response.url, index_level_int=index_level, page_number_int=page_status) meta_dict = { "page_type": "index", "total_pages": page_status, "index_level": index_level, } for one_url in urls: yield scrapy.Request(url=one_url, callback=self.parse_list_page, meta=meta_dict, dont_filter=True) elif 101 == page_status and not no_url_error: # 101 also means "index" == page_type pointer, link_list, index_level = self.divide_request_into_next_level( response=response) if pointer in [ 
-2, -3, -11, -12, ]: load_this_page_items = True elif -1 < pointer: # going to request all children level list page meta_dict = { "page_type": "index", "total_pages": 0, "index_level": index_level + 1, } if self.use_proxy: proxies_dict = self.proxy_ip_pool() meta_dict["proxy"] = proxies_dict["http"] for i in range(len(link_list) - pointer): new_url = self.make_new_url(parent_level_url=response.url, index_level=index_level, fragment=link_list[i + pointer]) if 0 < len(new_url): self.logger.info( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, requesting {new_url}; meta_dict = {meta_dict}" ) yield scrapy.Request(url=new_url, callback=self.parse_list_page, meta=meta_dict, dont_filter=True) elif 1001 == page_status and not no_url_error: self.parse_detailed_page(response=response) # 1001 also means "detailed" == page_type # will never reach here because self.parse_detailed_page() is the callback method if load_this_page_items: url = response.url city = self.get_city_from_url(url=url) text_list = self.parse_list_response_field(response=response, city=city) try: for text in text_list: loader = ItemLoader(item=Shop58Item(), response=response) loader = self.load_items_into_loader(loader=loader, text=text, url=url) yield loader.load_item() except Exception as ex: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, fail to load item. Exception = {ex}" ) def read_and_parse(self, response=None): self.logger.info( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {response.url}. under developing..." ) pass # file_list = os.listdir( self.saved_html_dir ) # for one_file in file_list: # if -1 == one_file.find("index"): # temp_list = one_file.split("___") # apt_id = 0 # city = "" # if 1 < len( temp_list ): # apt_id = temp_list[1] # city = temp_list[0] # url = f"https://{city}.esf.fang.com/chushou/3_{apt_id}.htm" # html_file_path = os.path.join( self.saved_html_dir, one_file ) # if os.path.isfile(html_file_path): # doc = None # with open( html_file_path,'rb') as f: # # doc = f.read().decode('gb2312', 'ignore') # doc = f.read().decode('utf-8', 'ignore') # if doc is None: # self.logger.error( f"Error: cannot read html file {html_file_path}.") # continue # response = Selector( text=doc, type="html" ) # text_list = self.parse_list_response_field( response = response, city = city, apt_id = apt_id ) # try: # for text in text_list: # loader = ItemLoader( item = Shop58Item(), response = response ) # loader = self.load_items_into_loader( loader = loader, text = text, url = url ) # yield loader.load_item() # except Exception as ex: # self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, fail to load item. Exception = {ex}" ) # if self.debug: # break def write_log(self, content=None, logfilename=None, content_only=False): if content is not None and 0 < len(content): today = datetime.datetime.now().strftime("%Y%m%d") if logfilename is None: logfilename = f"{self.name}{today}.log" try: with open(os.path.join(self.log_dir, logfilename), 'a', encoding='utf-8') as f: if content_only: info = f"{str(content)}\n" else: info = f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}] {content}\n" f.write(info) return 1 except Exception as ex: return 0 return -1
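# A minimal standalone sketch (not part of the spider) of the "pn{page}" URL handling that
# add_url_page_part() above performs for 58.com list pages: drop any existing "pnX" segment
# from the path, append "pn{page}", and re-attach the query string. The function name
# build_58_page_url is illustrative only; the spider keeps this logic as a method.
from urllib import parse


def build_58_page_url(old_url, page=2):
    url_obj = parse.urlparse(old_url)
    # keep every non-empty path segment except an existing "pnX" part
    segments = [one for one in url_obj.path.split("/") if one and not one.startswith("pn")]
    segments.append(f"pn{page}")
    new_url = f"{url_obj.scheme}://{url_obj.netloc}/" + "/".join(segments)
    if url_obj.query:
        new_url = f"{new_url}/?{url_obj.query}"
    return new_url


# build_58_page_url("https://gz.58.com/tianhe/shangpucz/?area=20_50", page=3)
#   -> "https://gz.58.com/tianhe/shangpucz/pn3/?area=20_50"
# build_58_page_url("https://fs.58.com/foshan/shangpucz/", page=2)
#   -> "https://fs.58.com/foshan/shangpucz/pn2"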
class QqhouseSpider(scrapy.Spider): """ sys.exit code == 1 # missing CITIES_FOR_CRAWLING sys.exit code == 2 # wrong or missing CITY_PAGE_DICT sys.exit code == 3 # wrong value(s) of CITY_PAGE_DICT On 20190527 Peter re-write this spider for fixing bugs """ name = "qqhouse" root_path = "" run_purpose = None missed_id_txt_filename = "" maximal_request_times = [] debug = None city_page_dict = {} maximal_list_pages = 0 city_list = [] save_every_response = False crawled_dir = "" detail_html_dir = "" list_html_dir = "" output_folder_name = "" log_dir = "" custom_settings = CommonClass.get_custom_settings_dict( spider=name ) date_list = [] def init_self_attributes(self): self.root_path = self.settings.get( "PROJECT_PATH" ) self.run_purpose = self.settings.get( name = "RUN_PURPOSE", default=None ) self.missed_id_txt_filename = self.settings.get( name = "MISSED_ID_TXT", default="" ) self.maximal_request_times = self.settings.get( name = "MAXIMAL_REQUEST_TIMES", default=[] ) self.debug = self.settings.get( name = "PROJECT_DEBUG", default=False ) self.city_page_dict = self.settings.get( name = "CITY_PAGE_DICT", default={} ) self.maximal_list_pages = self.settings.get( name = "MAXIMAL_LIST_PAGES", default=0 ) self.city_list = self.settings.get( name = "CITIES_FOR_CRAWLING", default=[] ) self.save_every_response = self.settings.get( name = "SAVE_EVERY_RESPONSE", default=False ) self.crawled_dir = self.settings.get( name="CRAWLED_DIR", default = "" ) self.detail_html_dir = self.settings.get( name="SAVED_DETAIL_HTML", default="" ) self.list_html_dir = self.settings.get( name="SAVED_LIST_HTML", default="" ) self.output_folder_name = self.settings.get( name="OUTPUT_FOLDER_NAME", default="" ) self.log_dir = self.settings.get( name="LOG_DIR", default="" ) def make_dirs(self): # even cache is used, we save all html files; here we make these 3 dirs if they do not exist if not os.path.isdir( self.crawled_dir ): os.makedirs( self.crawled_dir ) if not os.path.isdir( self.detail_html_dir ): os.makedirs( self.detail_html_dir ) if not os.path.isdir( self.list_html_dir ): os.makedirs( self.list_html_dir ) def start_requests(self): self.init_self_attributes() self.make_dirs() urls = [] callback_func = self.parse_list meta_dict = {"page_type": "list"} if 1 > len( self.city_list ): self.logger.error( f"self.city_list can NOT be empty." 
) sys.exit(1) for one_city in self.city_list: if one_city not in self.city_page_dict.keys(): self.logger.error( f"{one_city} is NOT in {self.city_page_dict}" ) sys.exit(2) if 1 > int( self.city_page_dict[ one_city ] ): self.logger.error( f"Wrong value of {self.city_page_dict[one_city]} (key == {one_city})" ) sys.exit(3) if 0 != self.maximal_list_pages and self.maximal_list_pages < int( self.city_page_dict[ one_city ] ): self.city_page_dict[ one_city ] = self.maximal_list_pages for one_city in self.city_list: for i in range( int( self.city_page_dict[ one_city ] ) ): # list_page urls urls.append( f"https://db.house.qq.com/index.php?mod=search&act=newsearch&city={one_city}&showtype=1&page_no={i+1}" ) if self.debug: self.logger.debug( urls ) urls = [ "http://quotes.toscrape.com/page/1/", "http://quotes.toscrape.com/page/2/", ] callback_func = self.do_nothing_for_debug elif "REDO_MISSED_HOUSE_IDS" == self.run_purpose: # REDO_MISSED_HOUSE_IDS is a special debug, HTTPCACHE_ENABLED == False before running REDO_MISSED_HOUSE_IDS urls = [] try: file_path = os.path.join( self.root_path, self.name, self.output_folder_name, self.missed_id_txt_filename ) city = "gz" with open( file_path, "r" ) as f: for one_id in f.readlines(): one_id = one_id.replace("\n", "") urls.append( one_id ) # f"https://db.house.qq.com/{city}_{one_id}/" except Exception as ex: self.logger.error( f"failed to read missed_id_txt_file from {file_path}. Exception = {ex}" ) sys.exit(4) callback_func = self.parse_detailed elif "REDO_MISSED_PAGE_IDS" == self.run_purpose: # REDO_MISSED_PAGE_IDS is a special debug, HTTPCACHE_ENABLED == False before running REDO_MISSED_PAGE_IDS urls = [] try: file_path = os.path.join( self.root_path, self.name, self.output_folder_name, self.missed_id_txt_filename ) city = "gz" with open( file_path, "r" ) as f: for one_id in f.readlines(): one_id = one_id.replace("\n", "") urls.append( f"https://db.house.qq.com/index.php?mod=search&act=newsearch&city={city}&showtype=1&page_no={one_id}" ) except Exception as ex: self.logger.error( f"failed to read missed_id_txt_file from {file_path}. 
Exception = {ex}" ) sys.exit(4) callback_func = self.parse_list elif "READ_CSV_TO_KAFKA" == self.run_purpose: temp_list = self.settings.get( name="DATES_TO_BE_READ", default=[] ) for one in temp_list: if isinstance( one, Iterable) and 0 < len( one ): temp_list.append( one ) if 0 < len( temp_list ): self.date_list = temp_list callback_func = self.read_csv_to_kafka urls = [ "http://quotes.toscrape.com/page/1/", ] if self.run_purpose in ["REDO_MISSED_HOUSE_IDS", "REDO_MISSED_PAGE_IDS", "READ_CSV_TO_KAFKA", ]: for url in urls: yield scrapy.Request( url=url, callback=callback_func, meta = meta_dict, dont_filter = True ) else: for url in urls: yield scrapy.Request( url=url, callback=callback_func, meta = meta_dict ) def read_csv_to_kafka(self, response): # do not go to pipeline and just read csv file and produce message to Kafka if 1 > len( self.date_list ): return False for one_date in self.date_list: folder_name = f"{one_date}crawled" crawled_dir = os.path.join( self.root_path, self.name, self.output_folder_name, f"{today}crawled" ) csv_file_path = os.path.join( crawled_dir, f"qqhouse{one_date}.csv" ) if os.path.isdir( crawled_dir ) and os.path.isfile( csv_file_path ): with open( csv_file_path, newline="" ) as csvfile: file_reader = csv.reader(csvfile) # , delimiter=' ', quotechar='|' for row in file_reader: temp_dict = eval(row) print( temp_dict ) print( type( temp_dict ) ) def do_nothing_for_debug(self, response): self.logger.info( f"inside Method do_nothing_for_debug of Class QqhouseSpider. url = {response.url}" ) def load_items_into_loader(self, loader = None, text = {}, url = ""): loader.add_value( 'content', str(text) ) # , encoding="utf-8" loader.add_value( 'page_type', "detailed" ) # record housekeeping fields loader.add_value('url', url) loader.add_value('project', self.settings.get('BOT_NAME') ) loader.add_value('spider', self.name ) loader.add_value('server', socket.gethostname() ) loader.add_value('date', datetime.datetime.now().strftime("%Y%m%d_%H%M%S") ) return loader def get_list_html_file_path( self, city = "", page_no = 0 ): if 1 > len( city ) or 1 > page_no: return "" return os.path.join( self.list_html_dir, f"{city}_list_{page_no}.html" ) def find_more_house_ids(self, doc = ""): house_id_list = [] counter = 0 index = 0 while True: index = doc.find("data-hid", index) if -1 == index: break sub_doc = doc[index+10:index+25] house_id_list.append( CommonClass.find_digits_from_str( sub_doc ) ) index += 10 counter += 1 return house_id_list def extract_all_detailed_html_links(self, string = ""): house_id_list = [] if 1 > len( string ): return house_id_list doc = string.decode('utf-8') end_string = '";var search_result_list_num =' end_pos = len( doc ) if -1 < doc.find( end_string ): end_pos = doc.find( end_string ) doc = doc[ len('var search_result = " '):end_pos ] doc = '<!DOCTYPE html><html><head lang="zh-cn"><title>腾讯房产列表</title></head><body>' + f"{doc}</body></html>" response = Selector( text=doc, type="html" ) house_id_list = response.xpath("//div/@data-hid").extract() if 10 > len( house_id_list ): house_id_list = self.find_more_house_ids( doc = doc ) else: temp_list = [] for one_id in house_id_list: temp_list.append( CommonClass.find_digits_from_str( one_id ) ) house_id_list = temp_list return house_id_list def parse_list(self, response = None): url = response.url city = "" page_no = 1 page_type = "list" if hasattr( response, "meta" ) and "page_type" in response.meta.keys(): page_type = response.meta["page_type"] if "list" == page_type: if 10 > len( str( response.body ) ): # 
cannot use 1 > ... meta_dict = self.request_counter_and_action(response = response) if 0 < meta_dict["request_counter"]: yield scrapy.Request( url=url, callback=self.parse_list, meta = meta_dict, dont_filter = True ) else: house_id_list = [] query_part_list = url.split("?") if 2 == len( query_part_list ): result_dict = parse.parse_qs( query_part_list[1] ) if "city" in result_dict.keys() and 0 < len( result_dict["city"] ): city = result_dict["city"][0] if "page_no" in result_dict.keys() and 0 < len(result_dict["page_no"]) and 1 < int( result_dict["page_no"][0] ): page_no = int( result_dict["page_no"][0] ) if self.save_every_response: list_html_file_path = self.get_list_html_file_path( city, page_no ) if 0 < len( list_html_file_path ): self.save_html( response = response, page_type = "list", city = city, page_no= str(page_no), house_id = "" ) house_id_list = self.extract_all_detailed_html_links( response.body ) # counter = 0 for one_id in house_id_list: next_url = f"https://db.house.qq.com/{city}_{one_id}/" self.logger.info( f"crawling next url at {next_url}" ) yield response.follow( next_url, self.parse_detailed ) else: self.logger.error( f"page_type ({page_type}) is NOT \"list\" in parse_list Method. url = {url}" ) def load_items_into_loader(self, loader = None, text = {}, url = ""): loader.add_value( 'content', str(text) ) # , encoding="utf-8" loader.add_value( 'page_type', "detailed" ) # record housekeeping fields loader.add_value('url', url) loader.add_value('project', self.settings.get('BOT_NAME') ) loader.add_value('spider', self.name ) loader.add_value('server', socket.gethostname() ) loader.add_value('date', datetime.datetime.now().strftime("%Y%m%d_%H%M%S") ) return loader def save_html(self, response=None, page_type = "detailed", city = "", page_no="", house_id = "" ): if response is None or not hasattr(response, "body") or not hasattr( response, "url" ): return False doc = response.body if "detailed" == page_type: temp_str = str( house_id ).zfill(8) file_path = os.path.join( self.detail_html_dir, f"{city}_{temp_str}.html" ) elif "list" == page_type: temp_str = str( page_no ).zfill(4) file_path = os.path.join( self.list_html_dir, f"{city}_list{temp_str}.txt" ) else: return False try: with open( file_path, 'wb' ) as f: f.write( doc ) except Exception as ex: self.logger.warning( f"failed to write response.body from {response.url}" ) return False return True def request_counter_and_action(self, response = None): request_counter = 0 request_pointer = 0 if hasattr( response, "meta" ) and "request_pointer" in response.meta.keys(): request_pointer = int( response.meta["request_pointer"] ) if hasattr( response, "meta" ) and "request_counter" in response.meta.keys(): request_counter = int(response.meta["request_counter"]) if request_pointer < len( self.maximal_request_times ): self.logger.info( f"request_counter == {request_counter}; request_pointer == {request_pointer} for the last request from {response.url}" ) if request_counter < self.maximal_request_times[request_pointer]: return { "request_counter": request_counter + 1, "request_pointer": request_pointer, } else: return { "request_counter": 1, "request_pointer": request_pointer + 1, } else: today = datetime.datetime.now().strftime("%Y%m%d") self.logger.error( f"{self.maximal_request_times} requests have been sent but ONLY empty response.body received from {response.url}" ) self.write_log( content = response.url, logfilename = f"missed_uris{today}.txt", content_only = True) return { "request_counter": -1, "request_pointer": 
request_pointer, } def extract_detailed_elements( self, response = None, city = "", house_id = "" ): text = {} # parse fields previously required big_box = response.css("div.item.fl") real_estate_name = response.css("div.name.fl div.cf h2::text").extract_first(default="") real_estate_slogan = big_box.css("div.hd.cf h1.Pagetitle::text").extract_first(default="") price_label = big_box.css("div.hd.cf h2.fl.yh.cf em.itemHeader::text").extract_first(default="") price_span_list = big_box.css("div.hd.cf h2.fl.yh.cf span.price::text").extract() price_span_money = big_box.css("div.hd.cf h2.fl.yh.cf span.price strong::text").extract_first(default="") if 2 == len( price_span_list ): price_str = f"{price_span_list[0]}___price___{price_span_money}___price___{price_span_list[1]}" else: price_str = "___price___".join(price_span_list) price_str = f"{price_str}___price___{price_span_money}" detail_lis = big_box.css( "ul.itemContent.itemContent3.pr li" ) items = [] for one_li in detail_lis: em_element = one_li.css( "em.itemHeader" ) if em_element is not None and 0 < len(em_element): item_value_list = one_li.css( "::text" ).extract() item_value = "" if 1 < len( item_value_list ): for index, value in enumerate(item_value_list): item_value_list[index] = value.strip() item_value += str("".join(item_value_list)) elif 1 == len( item_value_list ): item_value = item_value_list[0].strip() if "" != item_value: item_value = CommonClass.replace_string( string = item_value, char_to_remove = ['\r', '\n', '\t', ' ',], new_char = "___break___" ) items.append( item_value ) else: continue item_string = "" if 0 < len(items): item_string = "___descr___".join(items) if "" != item_string or "" != real_estate_name or "" != price_label or "" != price_str: text["real_estate_name"] = real_estate_name text["real_estate_slogan"] = real_estate_slogan text["price_label"] = price_label text["price_str"] = price_str text["item_string"] = item_string text["city"] = city text["house_id"] = house_id # parse fields required on 20190528 basic_info_box = response.css("div#xxIntr ul.hdl.ft") all_lis = basic_info_box.xpath("./li") item_list = [] for one_li in all_lis: key = one_li.xpath("./span/text()").extract_first(default="") value = one_li.xpath("./p/text()").extract_first(default="") if 0 < len( key ) and 0 < len( value ): item_list.append( f"{key}___key2value___{value}" ) if 0 < len( item_list ): text["basic_info"] = "___basic___".join( item_list ) return text def parse_detailed(self, response = None): # response=response.replace(encoding="gb2312") # do NOT use this line url = response.url doc = response.body doc = doc.decode("gb2312", "ignore") if 1 > len( str( doc ) ): meta_dict = self.request_counter_and_action(response = response) if 0 < meta_dict["request_counter"]: yield scrapy.Request( url=url, callback=self.parse_detailed, meta = meta_dict, dont_filter = True ) else: city = "" house_id = "" url_list = url.split( "qq.com" ) if 2 == len( url_list ): temp_list = url_list[1].replace("/", "") temp_list = temp_list.split("_") if 2 == len( temp_list ): city = temp_list[0] house_id = temp_list[1] if 0 < len( city) and 0 < len( house_id ): self.save_html( response = response, page_type = "detailed", city = city, house_id = house_id ) text = {} try: response2 = Selector(text=doc, type="html") text = self.extract_detailed_elements( response = response2, city = city, house_id = house_id ) except Exception as ex: self.logger.error( f"Error! 
Exception = {ex}; text = {text}" ) else: if 0 < len( text ): try: loader = ItemLoader( item = QqhouseItem(), response = response ) loader = self.load_items_into_loader( loader = loader, text = text, url = url ) yield loader.load_item() except Exception as ex: self.logger.error( f"Error happened during loading ItemLoader in Method parse_detailed of Class QqhouseSpider. Exception = {ex}" ) def write_log(self, content = None, logfilename = None, content_only = False): if content is not None and 0 < len( content ): today = datetime.datetime.now().strftime("%Y%m%d") if logfilename is None: logfilename = f"{self.name}{today}.log" try: with open( os.path.join( self.log_dir, logfilename ), 'a', encoding='utf-8') as f: if content_only: info = f"{str(content)}\n" else: info = f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}] {content}\n" f.write(info) return 1 except Exception as ex: return 0 return -1
def start_requests(self): self.init_self_attributes() self.make_dirs() self.read_crawled_urls() if "READ_HTML" == self.run_purpose: # READ_HTML is one kind of debug url = 'http://quotes.toscrape.com/page/1/' yield scrapy.Request(url=url, callback=self.read_and_parse) elif "PRODUCTION_RUN" == self.run_purpose: urls = [ # 广州 "https://land.3fang.com/market/440100__1______1_1_1.html", # 住宅用地: 26页 "https://land.3fang.com/market/440100__2______1_1_1.html", # 商业/办公用地: 17页 "https://land.3fang.com/market/440100__3_2__0_100000__1_1_1.html", # 工业用地, 已成交, 10万平米以下: 32页 "https://land.3fang.com/market/440100__3_2__100000_500000__1_1_1.html", # 工业用地, 已成交, 10-50万平米: 4页 "https://land.3fang.com/market/440100__3_2__500000_100000000__1_1_1.html", # 工业用地, 已成交, 50万平米以上: 1页 "https://land.3fang.com/market/440100__3_1_____1_1_1.html", # 工业用地, 未成交: 1页 "https://land.3fang.com/market/440100__3_3_____1_1_1.html", # 工业用地, 流拍: 7页 "https://land.3fang.com/market/440100__4______1_1_1.html", # 其他用地: 4页 # # 佛山 "https://land.3fang.com/market/440600__1_1_____1_1_1.html", # 住宅用地, 未成交: 8页 "https://land.3fang.com/market/440600__1_2__0_5000__1_1_1.html", # 住宅用地, 已成交, 5千平米以下: 33页 "https://land.3fang.com/market/440600__1_2__5000_100000__1_1_1.html", # 住宅用地, 已成交, 5千到10万平米: 29页 "https://land.3fang.com/market/440600__1_2__100000_100000000__1_1_1.html", # 住宅用地, 已成交, 10万平米以上: 6页 "https://land.3fang.com/market/440600__1_3_____1_1_1.html", # 住宅用地, 流拍: 3页 "https://land.3fang.com/market/440600__2______1_1_1.html", # 商业用地: 19页 "https://land.3fang.com/market/440600__3_1_____1_1_1.html", # 工业用地, 未成交: 6页 "https://land.3fang.com/market/440600__3_2__0_40000__1_1_1.html", # 工业用地, 已成交, 4万平米以下: 32页 "https://land.3fang.com/market/440600__3_2__40000_100000000__1_1_1.html", # 工业用地, 已成交, 4万平米以上: 12页 "https://land.3fang.com/market/440600__3_3_____1_1_1.html", # 工业用地, 流拍: 1页 "https://land.3fang.com/market/440600__4______1_1_1.html", # 其他用地: 3页 ] meta_dict = { "page_type": "index", "total_pages": 0, "index_level": 0, } if self.use_proxy: proxies_dict = self.proxy_ip_pool() if 1 > len(proxies_dict): sys.exit(3) meta_dict["proxy"] = proxies_dict["http"] cookie_dict = dict([ pair.split("=", 1) for pair in self.cookie_string.split("; ") ]) self.cookie_dict = cookie_dict for url in urls: url_object = parse.urlparse(url) path_list = url_object.path.split("/") for one in path_list: if -1 == one.find(".html"): continue city_name = "" city_code_list = one.split("_") city_code = int( city_code_list[0]) if 0 < len(city_code_list) else 0 if 0 < city_code and str( city_code) in self.city_name_dict.keys(): city_name = self.city_name_dict[str(city_code)] if 1 > len(city_name): error_msg = f"{city_code} is NOT in self.city_name_dict.keys() ({self.city_name_dict.keys()})" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) sys.exit(4) break meta_dict["city"] = city_name # cookie_dict = self.change_cookies( cookie_dict ) yield scrapy.Request(url=url, cookies=cookie_dict, callback=self.parse_list_page, meta=meta_dict, dont_filter=True) # yield scrapy.Request( url = url, callback = self.parse_list_page, meta = meta_dict, dont_filter = True ) elif "READ_CSV_AND_REDO" == self.run_purpose: english_city_name = { "佛山": "foshan", "广州": "guangzhou", } filename = "tudi_201808.csv" csv_file_path = os.path.join(self.crawled_dir, filename) url_list = [] city_list = [] try: with open(csv_file_path, newline="", encoding="utf-8") as csvfile: file_reader = csv.reader( csvfile) # , delimiter=' ', quotechar='|' for row in 
file_reader: if -1 < row[8].find("https:"): url_list.append(row[8]) city_list.append(row[13]) except Exception as ex: error_msg = f"cannot read csv file, Exception = {ex}" self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) meta_dict = { "page_type": "detailed", "total_pages": 1, } self.cookie_dict = dict([ pair.split("=", 1) for pair in self.cookie_string.split("; ") ]) if self.use_proxy: proxies_dict = self.proxy_ip_pool() meta_dict["proxy"] = proxies_dict["http"] for index, url in enumerate(url_list): chinese_city_name = city_list[index] meta_dict["city"] = english_city_name[chinese_city_name] yield scrapy.Request(url=url, cookies=self.cookie_dict, callback=self.parse_detailed_page, meta=meta_dict, dont_filter=True) break elif "CHECK_PROXY_IP" == self.run_purpose: now = int(time.time()) token = f"Guangzhou{str(now)}" m = hashlib.md5() m.update(token.encode(encoding='utf-8')) urls = [ f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}", ] if "DRAGONFLY" == self.proxy_agent: proxies_dict = CommonClass.get_proxies(proxy_dict={}) else: proxies_dict = ProxyAgent.get_xunlian_proxy_dict( headers={}, params_for_proxy_ip={}, setup_xunlian_dict={}, need_setup_xunlian=False, logger=self.logger) if 0 < len(proxies_dict): meta_dict = {"proxy": proxies_dict["http"]} for url in urls: yield scrapy.Request(url=url, callback=self.do_nothing_for_debug, meta=meta_dict) else: self.logger.error( f"Error! No proxy ip returns. {proxies_dict}") else: urls = [ "http://quotes.toscrape.com/page/1/", "http://quotes.toscrape.com/page/2/", ] for url in urls: yield scrapy.Request(url=url, callback=self.do_nothing_for_debug)
class DirectionamapSpider(scrapy.Spider): """ sys.exit code == 2 # missing CITY_LIST or missing input file(s) sys.exit code == 3 # classification file format error sys.exit code == 4 # already requested all xy points in city_list today """ name = "directionbaidu" root_path = "" log_dir = "" # debug = False # save_every_response = False crawled_dir = "" json_dir = "" output_folder_name = "" # output_file_format = "json" # base_uri = "" run_purpose = None overwrite_today = "" custom_settings = CommonClass.get_custom_settings_dict(spider=name) # crontab will start a new process in every 2 hours; therefore in 1 day, the crontab will start 12 times maximal_requests_of_one_crontab_process = 23 interval_between_requests = 300 request_counter = 0 last_4_requests = {} urls = [] def init_self_attributes(self): self.root_path = self.settings.get("PROJECT_PATH") self.log_dir = self.settings.get(name="LOG_DIR", default="") # self.debug = self.settings.get( name = "PROJECT_DEBUG", default=False ) # self.save_every_response = self.settings.get( name = "SAVE_EVERY_RESPONSE", default=False ) self.crawled_dir = self.settings.get(name="CRAWLED_DIR", default="") self.json_dir = self.settings.get(name="SAVED_JSON", default="") self.output_folder_name = self.settings.get(name="OUTPUT_FOLDER_NAME", default="") self.base_uri = self.settings.get(name="BASE_URI", default="") self.run_purpose = self.settings.get(name="RUN_PURPOSE", default=None) self.overwrite_today = self.settings.get(name="OVERWRITE_TODAY", default="") self.maximal_requests_of_one_crontab_process = self.settings.get( name="MAXIMAL_REQUESTS_OF_ONE_CRONTAB_PROCESS", default=23) self.interval_between_requests = self.settings.get( name="INTERVAL_BETWEEN_REQUESTS", default=300) xy_points = { "country_garden": "22.9299453776,113.2749357238", "baiyun_airport_departure": "23.3932641265,113.3085855889", # T1航站楼国内出发 "baiyun_airport_arrival": "23.3937931265,113.3068755889", # T1航站楼国内到达 "baoan_airport_departure": "22.6303448273,113.8207143453", # T3航站楼国内出发 "baoan_airport_arrival": "22.6296848273,113.8192343453", # T3航站楼国内到达 } query_dict = { "origin": xy_points["country_garden"], "destination": xy_points["baiyun_airport_departure"], "coord_type": "bd09ll", "ret_coordtype": "bd09ll", "tactics": 7, "alternatives": 0, "output": "json", "ak": "iL3ZmAje32Q6WrXgaWcBSZP0RZG1hekL", } # 0 == 碧桂园总部到白云机场;1 == 白云机场到碧桂园总部;2 == 总部到宝安机场;3 == 宝安机场到总部 query_list = [] query_list.append(query_dict) temp_dict = copy.deepcopy(query_dict) temp_dict["origin"] = xy_points["baiyun_airport_arrival"] temp_dict["destination"] = xy_points["country_garden"] query_list.append(temp_dict) temp_dict = copy.deepcopy(query_dict) temp_dict["origin"] = xy_points["country_garden"] temp_dict["destination"] = xy_points["baoan_airport_departure"] query_list.append(temp_dict) temp_dict = copy.deepcopy(query_dict) temp_dict["origin"] = xy_points["baoan_airport_arrival"] temp_dict["destination"] = xy_points["country_garden"] query_list.append(temp_dict) for one_query_dict in query_list: self.urls.append( f"{self.base_uri}?{parse.urlencode(one_query_dict)}") if 4 != len(self.urls): self.logger.error(f"self.urls length shall be 4 ({self.urls})") def check_dirs_and_files(self): if not os.path.isdir(self.crawled_dir): os.makedirs(self.crawled_dir) if not os.path.isdir(self.json_dir): os.makedirs(self.json_dir) def start_requests(self): self.init_self_attributes() self.check_dirs_and_files() if "READ_JSON_AND_WRITE_CSV" == self.run_purpose: one_url = 
"https://blog.csdn.net/qq_37193537/article/details/78987949" callback_func = self.read_json_and_parse yield scrapy.Request(url=one_url, callback=callback_func, dont_filter=True) else: timestamp_float = time.time() self.last_4_requests = { "request_time": timestamp_float, "requested_index": [ 0, 1, 2, 3, ] } callback_func = self.parse_json for index, one_url in enumerate(self.urls): meta_dict = { "preset_route": index, # 0 == 碧桂园总部到白云机场;1 == 白云机场到碧桂园总部;2 == 总部到宝安机场;3 == 宝安机场到总部 "redo": 0, } self.logger.info(f"{index}: requesting {one_url} ") yield scrapy.Request(url=one_url, callback=callback_func, meta=meta_dict, dont_filter=True) def get_url_according_to_preset_route(self, preset_route=101): # 由于有一个严重的bug直到20190619_2220才修补,导致在这之前的所有请求都是宝安机场到总部的(即preset_route == 3) baoan2headquarter = "http://api.map.baidu.com/direction/v2/driving?origin=22.6296848273%2C113.8192343453&destination=22.9299453776%2C113.2749357238&coord_type=bd09ll&ret_coordtype=bd09ll&tactics=7&alternatives=0&output=json&ak=iL3ZmAje32Q6WrXgaWcBSZP0RZG1hekL" if 0 < len(self.overwrite_today ) and "READ_JSON_AND_WRITE_CSV" == self.run_purpose: time_array = time.strptime(self.overwrite_today, "%Y%m%d") timestamp_overwrite_today = float(time.mktime(time_array)) time_array = time.strptime("20190619_222000", "%Y%m%d_%H%M%S") timestamp_bug_fixed = float(time.mktime(time_array)) if timestamp_overwrite_today < timestamp_bug_fixed: return baoan2headquarter if 3 == preset_route: return baoan2headquarter elif 2 == preset_route: return "http://api.map.baidu.com/direction/v2/driving?origin=22.9299453776%2C113.2749357238&destination=22.6303448273%2C113.8207143453&coord_type=bd09ll&ret_coordtype=bd09ll&tactics=7&alternatives=0&output=json&ak=iL3ZmAje32Q6WrXgaWcBSZP0RZG1hekL" elif 1 == preset_route: return "http://api.map.baidu.com/direction/v2/driving?origin=23.3937931265%2C113.3068755889&destination=22.9299453776%2C113.2749357238&coord_type=bd09ll&ret_coordtype=bd09ll&tactics=7&alternatives=0&output=json&ak=iL3ZmAje32Q6WrXgaWcBSZP0RZG1hekL" elif 0 == preset_route: return "http://api.map.baidu.com/direction/v2/driving?origin=22.9299453776%2C113.2749357238&destination=23.3932641265%2C113.3085855889&coord_type=bd09ll&ret_coordtype=bd09ll&tactics=7&alternatives=0&output=json&ak=iL3ZmAje32Q6WrXgaWcBSZP0RZG1hekL" return "" def read_json_and_parse(self, response): file_list = os.listdir(self.json_dir) # route0___0___20190615_234522.json for one_file in file_list: temp_list = one_file.split("___") preset_route = 0 now = "" if 2 < len(temp_list): preset_route = temp_list[0] preset_route = preset_route.lstrip("route") preset_route = CommonClass.find_digits_from_str( string=preset_route, return_all=False) preset_route = int(preset_route) now = temp_list[2] now = now.rstrip(".json") url = self.get_url_according_to_preset_route( preset_route=preset_route) json_file_path = os.path.join(self.json_dir, one_file) if os.path.isfile(json_file_path): try: doc = None with open(json_file_path, "rb") as f: doc = f.read().decode("utf-8", "ignore") if doc is None: self.logger.error( f"Error: cannot read html file {json_file_path}." 
)
                        continue
                    text_dict = self.extract_text_dict_from_response_body(body=doc, preset_route=preset_route, now=now)
                    if 0 < len(text_dict):
                        json_selector = Selector(text=doc, type=None)
                        loader = ItemLoader(item=DirectionbaiduItem(), selector=json_selector)
                        loader = self.load_items_into_loader(loader=loader, text=text_dict, url=url, now=now)
                        yield loader.load_item()
                except Exception as ex:
                    self.logger.error(
                        f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, error happened during loading ItemLoader. Exception = {ex}"
                    )

    def extract_text_dict_from_response_body(self, body="", preset_route=101, now=""):
        text_dict = {}
        json_dict = json.loads(body)
        result_dict = json_dict["result"] if "result" in json_dict.keys() else {}
        total = int(result_dict["total"]) if "total" in result_dict.keys() else 0
        routes_list = result_dict["routes"] if "routes" in result_dict.keys() else []
        selected_route_dict = {}
        if 1 < len(routes_list):
            for one_route_dict in routes_list:
                tag = one_route_dict["tag"] if "tag" in one_route_dict.keys() else ""
                if -1 < tag.find("推荐路线"):
                    selected_route_dict = one_route_dict
                    break
        elif 1 == len(routes_list):
            selected_route_dict = routes_list[0]
        # if there is no "推荐路线" (recommended route), just select the first route_dict
        if 1 < len(routes_list) and 1 > len(selected_route_dict):
            selected_route_dict = routes_list[0]
        if 0 < len(selected_route_dict):
            tag = selected_route_dict["tag"] if "tag" in selected_route_dict.keys() else ""
            distance = selected_route_dict["distance"] if "distance" in selected_route_dict.keys() else 0
            duration = selected_route_dict["duration"] if "duration" in selected_route_dict.keys() else 0
            selected_path_steps = selected_route_dict["steps"] if "steps" in selected_route_dict.keys() else []
            text_dict = {
                "preset_route": preset_route,
                "strategy": tag,
                "duration": duration,
                "distance": distance,
                "count": total,
                "paths": len(routes_list),
                "now": now,
                "selected_path_steps": selected_path_steps,
            }
        return text_dict

    def parse_json(self, response):
        status, message = self.save_json(response=response, page_type="json")
        now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        preset_route = -1
        if 0 == status:
            try:
                meta_dict = response.meta
                preset_route = int(meta_dict["preset_route"])
                text_dict = self.extract_text_dict_from_response_body(body=response.body, preset_route=preset_route, now=now)
                if 0 < len(text_dict):
                    loader = ItemLoader(item=DirectionbaiduItem(), response=response)
                    loader = self.load_items_into_loader(loader=loader, text=text_dict, url=response.url, now=now)
                    yield loader.load_item()
            except Exception as ex:
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, error happened during loading ItemLoader. 
Exception = {ex}" ) if -1 == preset_route: if hasattr(response, "meta"): meta_dict = response.meta if "preset_route" in meta_dict.keys(): preset_route = int(meta_dict["preset_route"]) if -1 < preset_route: received_all_4_requests_bool = self.check_this_preset_route( preset_route=preset_route) if not received_all_4_requests_bool and "redo" in response.meta.keys( ): delayed_index_list = self.get_delayed_response_more_than_1_minute( ) if 0 < len(delayed_index_list): request_result_bool = self.redo_requests( redo=response.meta["redo"]) # get data again after 5 minutes if self.request_counter < self.maximal_requests_of_one_crontab_process and received_all_4_requests_bool: while (self.check_time_interval()): time.sleep(10) self.request_counter += 1 now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") self.logger.info( f" requesting amap at {now} ( {self.request_counter} of { self.maximal_requests_of_one_crontab_process } )" ) self.last_4_requests = { "request_time": time.time(), "requested_index": [ 0, 1, 2, 3, ] } callback_func = self.parse_json for index, one_url in enumerate(self.urls): meta_dict = { "preset_route": index, "redo": 0, } self.logger.info(f"{index}: requesting {one_url} ") yield scrapy.Request(url=one_url, callback=callback_func, meta=meta_dict, dont_filter=True) def check_time_interval(self): if "request_time" not in self.last_4_requests.keys() or not isinstance( self.last_4_requests["request_time"], float): return False if time.time() - self.last_4_requests["request_time"] > float( self.interval_between_requests): return False return True def redo_requests(self, redo=-1): urls = [] index_list = [] if 1 > len(self.last_4_requests["requested_index"]) or 0 > redo: return False for one_index in self.last_4_requests["requested_index"]: urls.append(self.urls[one_index]) index_list.append(one_index) now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") for index, one_url in enumerate(urls): meta_dict = { "preset_route": index_list[index], "redo": redo + 1, } self.logger.info( f"[{now}] redo {index_list[index]}: requesting {one_url} ") yield scrapy.Request(url=one_url, callback=self.parse_json, meta=meta_dict, dont_filter=True) def get_delayed_response_more_than_1_minute(self): if "requested_index" not in self.last_4_requests.keys( ) or not isinstance(self.last_4_requests["requested_index"], list): return [] if "request_time" not in self.last_4_requests.keys() or not isinstance( self.last_4_requests["request_time"], float): return [] if time.time() - self.last_4_requests["request_time"] > 60.0: return self.last_4_requests["requested_index"] return [] def check_this_preset_route(self, preset_route=-1): if preset_route not in [ 0, 1, 2, 3, ]: return True if "request_time" not in self.last_4_requests.keys() or not isinstance( self.last_4_requests["request_time"], float): return True if "requested_index" not in self.last_4_requests.keys( ) or not isinstance(self.last_4_requests["requested_index"], list): return True # 4 minutes have passed, just return True if time.time() - self.last_4_requests["request_time"] > 240.0: return True # remove current preset_route if preset_route in self.last_4_requests["requested_index"]: self.last_4_requests["requested_index"].remove(preset_route) if 1 > len(self.last_4_requests["requested_index"]): return True # There are(is a) element(s) in self.last_4_requests["requested_index"] return False def load_items_into_loader(self, loader=None, text={}, url="", now=""): loader.add_value("url", url) loader.add_value("project", self.settings.get("BOT_NAME")) 
loader.add_value("spider", self.name) loader.add_value("server", socket.gethostname()) loader.add_value("date", now) loader.add_value("content", str(text)) loader.add_value("page_type", "json") return loader def save_json(self, response=None, page_type="json"): status = -4 if response is None or not hasattr(response, "body") or not hasattr( response, "url") or not hasattr(response, "meta"): self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, bad response object" ) return (-1, f"wrong response object") meta_dict = response.meta preset_route = meta_dict[ "preset_route"] if "preset_route" in meta_dict.keys() else "" file_path = "" if "json" == page_type: json_dict = json.loads(response.body) status = json_dict["status"] if "status" in json_dict.keys( ) else "404" result_dict = json_dict["result"] if "result" in json_dict.keys( ) else {} routes_list = result_dict[ "routes"] if "routes" in result_dict.keys() else [] if isinstance(routes_list, list) and 0 < len(routes_list): now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") file_path = os.path.join( self.json_dir, f"route{preset_route}___{status}___{now}.json") status = int(status) else: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, wrong parameter page_type == {page_type} from {response.url}" ) return (-2, f"page_type can ONLY be json") return_msg = "0 count" if 0 < len(file_path): try: with open(file_path, 'wb') as f: f.write(response.body) except Exception as ex: self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, failed to write response.body from {response.url}" ) return (status, f"failed to write json file") # not -3 return (status, return_msg)
def parse_detailed_response_field(self, response=None, city=""): text = {} if response is None: return text if "READ_HTML" == self.run_purpose and not isinstance( response, Selector): return text information_div = response.xpath("//div[@id='printData1']") title = information_div.xpath( "./div[@class='tit_box01']/text()").extract_first(default="") land_id = information_div.xpath( "./div[@class='menubox01 mt20']/span[@class='gray2']/text()" ).extract_first(default="") province_city = information_div.xpath( "string(./div[@class='menubox01 p0515']/div[@class='fl'])" ).extract() province_city = "___".join(province_city) if 0 < len(title): text["title"] = title if 0 < len(land_id): text["land_id"] = land_id if 0 < len(province_city): text["province_city"] = province_city key1 = information_div.xpath( "./div[@class='p1015']/div[@class='tit_box02 border03']/text()" ).extract_first(default="") if "土地基本信息" == key1: basic_info = {} tr_list1 = information_div.xpath( "./div[@class='p1015']/div[@class='tit_box02 border03']/following-sibling::table[@class='tablebox02 mt10']/tbody/tr" ) for index, one_tr in enumerate(tr_list1): string_list = one_tr.xpath("string(.)").extract() td_list = [] for one_str in string_list: cleaned_str = CommonClass.clean_string(string=one_str, char_to_remove=[ '\xa0', '\n', '\t', ' ', ]) td_list.append(cleaned_str.strip('\r')) basic_info[index] = "___".join(td_list) text[key1] = basic_info key2 = information_div.xpath( "./div[@class='p1015']/div[@class='tit_box02 border03 mt20']/text()" ).extract_first(default="") if "土地交易信息" == key2: trade_info = {} tr_list2 = information_div.xpath( "./div[@class='p1015']/div[@class='tit_box02 border03 mt20']/following-sibling::div[@class='banbox']/table[@class='tablebox02 mt10']/tbody/tr" ) for index, one_tr in enumerate(tr_list2): string_list = one_tr.xpath("string(.)").extract() td_list = [] for one_str in string_list: cleaned_str = CommonClass.clean_string(string=one_str, char_to_remove=[ '\xa0', '\n', '\t', ' ', ]) td_list.append(cleaned_str.strip('\r')) trade_info[index] = "___".join(td_list) text[key2] = trade_info # 20190730 cannot get 土地评估结果, todo ... 
# evaluation_div = response.xpath("//div[@id='divpg']") # key3 = evaluation_div.xpath("./div[@class='tit_box02 border03 mt20']/text()").extract_first( default= "" ) # if "土地评估结果" == key3: # evaluation_dict = {} # tr_list3 = evaluation_div.xpath("./div[@class='table-03']/table[@class='mt5']/tbody/tr") # for index, one_tr in enumerate( tr_list3 ): # this_td = one_tr.xpath("./td") # if this_td is None: # string_list = one_tr.xpath("string(./th)").extract() # else: # td_list = one_tr.xpath("./td") # string_list = [] # for one_td in td_list: # unit = one_td.xpath("./text()").extract_first( default= "" ) # amount = one_td.xpath("./span/text()").extract_first( default= "" ) # string_list.append( f"{amount}___{unit}" ) # # this_td_str_list = one_td.xpath("string(.)").extract() # # string_list.extend( this_td_str_list ) # td_th_list = [] # for one_str in string_list: # cleaned_str = CommonClass.clean_string( string = one_str, char_to_remove = [ '\xa0', '\n', '\t', ' ',] ) # td_th_list.append( cleaned_str.strip('\r') ) # evaluation_dict[index] = "___".join( td_th_list ) # text[key3] = evaluation_dict # evaluation_div = response.xpath("//div[@id='divpg']") # key3 = evaluation_div.xpath("./div[@class='tit_box02 border03 mt20']/text()").extract_first( default= "" ) # if "土地评估结果" == key3: # evaluation_dict = {} # th_list3 = evaluation_div.xpath("./div[@class='table-03']/table[@class='mt5']/tbody/tr/th") # string_list = th_list3.xpath("string(.)").extract() # evaluation_dict["fields"] = "___".join( string_list ) # tr_list3 = evaluation_div.xpath("./div[@class='table-03']/table[@class='mt5']/tbody/tr") # row2 = tr_list3[1].xpath("./td") # row2string = "" # str1 = row2[0].xpath("./text()").extract_first( default= "" ) # str2 = row2[1].xpath("string(.)").extract() # str2 = "___".join( str2 ) # str3amount = response.xpath("//span[@id='scbj_bpgj']") # str3unit = row2[2].xpath("./text()").extract_first( default= "" ) # str4amount = response.xpath("//span[@id='scbj_bSumPrice']") # str4amount = str4amount.get() # str3amount = str3amount.get() # str4unit = row2[3].xpath("./text()").extract_first( default= "" ) # str5 = row2[4].xpath("./a/@href").extract_first( default= "" ) # evaluation_dict[str1] = f"{str2}___{str3amount} {str3unit}___{str4amount} {str4unit}___{str5}" # row3 = tr_list3[2].xpath("./td") # row3str = row3.xpath("string(.)").extract() # evaluation_dict["假设开发法"] = "___".join( row3str ) # text[key3] = evaluation_dict if 0 < len(text): text["city"] = city return text
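# A standalone sketch of the table-extraction pattern used in parse_detailed_response_field()
# above: walk each <tr>, pull the cell text, strip layout characters and join the row with
# "___". The sample HTML and extract_rows() are illustrative only (the real pages come from
# the land listing site, and the spider itself flattens each row with the XPath string(.) call).
from scrapy.selector import Selector

SAMPLE_HTML = """
<table class="tablebox02 mt10">
  <tbody>
    <tr><td>District</td><td>Example district</td></tr>
    <tr><td>Total area (hectare)</td><td>1.23</td></tr>
  </tbody>
</table>
"""

def extract_rows(html):
    rows = {}
    selector = Selector(text=html)
    for index, one_tr in enumerate(selector.xpath("//table[@class='tablebox02 mt10']//tr")):
        cells = one_tr.xpath("./td//text()").extract()
        cleaned = [cell.replace("\xa0", "").strip() for cell in cells]
        rows[index] = "___".join(cell for cell in cleaned if cell)
    return rows

if __name__ == "__main__":
    print(extract_rows(SAMPLE_HTML))
    # {0: 'District___Example district', 1: 'Total area (hectare)___1.23'}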
class FangSpider(scrapy.Spider): """ sys.exit code == 1 # wrong or missing RUN_PURPOSE sys.exit code == 2 # wrong or missing CRAWLED_DIR, SAVED_DETAIL_HTML, or SAVED_GAODE_JASON On 20190517 Peter re-write this spider for fixing bugs """ name = "fang" csv_filename = None root_path = "" run_purpose = None overwrite_today = "" crawled_dir = "" detail_html_dir = "" gaode_json_dir = "" csv_file_path = None custom_settings = CommonClass.get_custom_settings_dict(spider=name) def init_self_attributes(self): self.root_path = self.settings.get("PROJECT_PATH") self.run_purpose = self.settings.get(name="RUN_PURPOSE", default=None) if self.run_purpose is None: self.logger.error( f"missing RUN_PURPOSE ({self.run_purpose}) setting") sys.exit(1) self.overwrite_today = self.settings.get("OVERWRITE_TODAY", default="") self.debug = self.settings.get( name="PROJECT_DEBUG", default=False) # whether this run is for debugging if not hasattr(self, "overwrite_today") or 1 > len( self.overwrite_today) or self.overwrite_today is None: self.overwrite_today = datetime.datetime.now().strftime("%Y%m%d") # set all paths self.crawled_dir = self.settings.get(name='CRAWLED_DIR', default="") self.detail_html_dir = self.settings.get(name='SAVED_DETAIL_HTML', default="") self.gaode_json_dir = self.settings.get(name='SAVED_GAODE_JASON', default="") self.csv_file_path = os.path.join( self.crawled_dir, f"fang_zu{self.overwrite_today}.csv") if 1 > len(self.crawled_dir) or 1 > len( self.detail_html_dir) or 1 > len(self.gaode_json_dir): self.logger.info( f"missing CRAWLED_DIR ({self.crawled_dir}), SAVED_DETAIL_HTML ({self.detail_html_dir}), or SAVED_GAODE_JASON ({self.gaode_json_dir}) setting(s)" ) sys.exit(2) def make_dirs(self): # even cache is used, we save all html files; here we make these 3 dirs if they do not exist if not os.path.isdir(self.crawled_dir): os.makedirs(self.crawled_dir) if not os.path.isdir(self.detail_html_dir): os.makedirs(self.detail_html_dir) if not os.path.isdir(self.gaode_json_dir): os.makedirs(self.gaode_json_dir) def start_requests(self): self.init_self_attributes() self.make_dirs() if "READ_HTML" == self.run_purpose: url = 'http://quotes.toscrape.com/page/1/' yield scrapy.Request(url=url, callback=self.read_and_parse) elif "PRODUCTION_RUN" == self.run_purpose: city_list = self.settings.get("CITY_LIST", default=[]) number_day_of_this_year = datetime.datetime.now().timetuple( ).tm_yday # type == int seperate_into_days = self.settings.get("CRAWL_BATCHES", default=3) if seperate_into_days > len(city_list): seperate_into_days = len(city_list) batch_count = math.ceil(len(city_list) / seperate_into_days) today_batch = number_day_of_this_year % seperate_into_days start_index = today_batch * batch_count - 1 end_index = (today_batch + 1) * batch_count urls = [] for index, city in enumerate(city_list): if (start_index < index) and (index < end_index): urls.append(f"https://{city}.zu.fang.com/") for url in urls: yield scrapy.Request(url=url, callback=self.parse) else: urls = [ 'http://quotes.toscrape.com/page/1/', 'http://quotes.toscrape.com/page/2/', 'http://quotes.toscrape.com/page/3/', 'http://quotes.toscrape.com/page/4/', ] for url in urls: yield scrapy.Request(url=url, callback=self.do_nothing_for_debug) def do_nothing_for_debug(self, response): self.logger.info( f"inside Method do_nothing_for_debug of Class FangSpider. 
url = {response.url}" ) def read_and_parse(self, response): file_list = os.listdir(self.detail_html_dir) for one_file in file_list: if -1 < one_file.find("index"): self.logger.info(f"ignoring {one_file}") else: temp_list = one_file.split("_") apt_id = 0 city_name = "" if 1 < len(temp_list): apt_id = temp_list[1] city_name = temp_list[0] url = f"https://{city_name}.zu.fang.com/house/" html_file = os.path.join(self.detail_html_dir, one_file) if os.path.isfile(html_file): doc = None with open(html_file, 'rb') as f: doc = f.read().decode('gb2312', 'ignore') if doc is None: self.logger.error( f"Error: cannot read html file {html_file}.") continue response = Selector(text=doc, type="html") text = self.parse_response_field(response=response, city_name=city_name, apt_id=apt_id) try: response_for_items = TextResponse( url=url, status=200, body=bytes(doc, encoding="utf-8")) loader = ItemLoader(item=FangItem(), response=response_for_items) loader = self.load_items_into_loader(loader=loader, text=text, url=url) yield loader.load_item() except Exception as ex: print( f"Error happened during parsing in Method read_and_parse of Class FangSpider. Exception = {ex}" ) def load_items_into_loader(self, loader=None, text={}, url=""): loader.add_value('content', str(text)) # , encoding="utf-8" loader.add_value('page_type', "detailed") # record housekeeping fields loader.add_value('url', url) loader.add_value('project', self.settings.get('BOT_NAME')) loader.add_value('spider', self.name) loader.add_value('server', socket.gethostname()) loader.add_value('date', datetime.datetime.now().strftime("%Y%m%d_%H%M%S")) return loader def parse_response_field(self, response=None, city_name="", apt_id=""): text = {} if response is None: return text if "READ_HTML" == self.run_purpose and not isinstance( response, Selector): return text address_list = response.xpath( '//div[@class="trl-item2 clearfix"]/div[@class="rcont"]') address = address_list[0].xpath( '//div[@class="rcont"]/a/text()').extract_first( default="") if 0 < len(address_list) else "" location_list = response.xpath( '//div[@class="trl-item2 clearfix"]/div[@class="rcont address_zf"]/a/text()' ).extract() if location_list is None or 1 > len(location_list): location_list = response.xpath( '//div[@class="trl-item2 clearfix"]/div[@class="rcont"]/a[@class="link-under"]/text()' ).extract() address_list = response.xpath( '//div[@class="trl-item2 clearfix"]/div[@class="rcont"]/a[not(@class)]/text()' ).extract() address = "" if 0 < len(address_list): address = ";".join(address_list) location_list.reverse() location = "" for one_location in location_list: location += one_location if 0 < len(address): address = CommonClass.clean_string(string=address, char_to_remove=[ '\r', '\n', '\t', '"', ]) if 0 < len(location): location = CommonClass.clean_string(string=location, char_to_remove=[ '\r', '\n', '\t', '"', ]) rent_div = response.xpath( '//div[@class="tr-line clearfix zf_new_title"]/div[@class="trl-item sty1 rel"]' ) if rent_div is None or 1 > len(rent_div): rent_div = response.xpath( '//div[@class="tr-line clearfix zf_new_title"]/div[@class="trl-item sty1"]' ) temp = rent_div.css('::text').extract() rent_list = [] for one_rent in temp: temp2 = one_rent.replace("\n", " ") temp2 = temp2.strip() if 0 < len(temp2): rent_list.append(temp2) while "" in rent_list: rent_list.remove("") rent = "" if 1 < len(rent_list): rent = rent_list[0] + rent_list[1] rent_type_div = response.xpath( '//div[@class="trl-item1 w146"]/div[@class="tt"]') rent_type = 
rent_type_div[0].css('div::text').extract_first( default="") if 0 < len(rent_type_div) else "" facing = rent_type_div[1].css('div::text').extract_first( default="") if 1 < len(rent_type_div) else "" apt_type_div = response.xpath( '//div[@class="trl-item1 w182"]/div[@class="tt"]') apt_type = apt_type_div[0].css('div::text').extract_first( default="") if 0 < len(apt_type_div) else "" floor = apt_type_div[1].css('div::text').extract_first( default="") if 1 < len(apt_type_div) else "" area_div = response.xpath( '//div[@class="trl-item1 w132"]/div[@class="tt"]') area = area_div[0].css('div::text').extract_first( default="") if 0 < len(area_div) else "" decorate = area_div[1].css('div::text').extract_first( default="") if 1 < len(area_div) else "" update_date_spans = response.xpath('//p[@class="gray9 fybh-zf"]/span') update_date = "" if 1 < len(update_date_spans): update_date = update_date_spans[1].css("::text").extract_first( default="") text = { "rent_id": f"{city_name}_{apt_id.strip()}_{self.overwrite_today}", "location": location.strip(), "address": address.strip(), "rent": rent.strip(), "rent_type": rent_type.strip(), "facing": facing.strip(), "apt_type": apt_type.strip(), "floor": floor.strip(), "area": area.strip(), "decorate": decorate.strip(), "update_date": update_date.strip(), } return text def parse_one_detail_page(self, response=None, apt_id=0, city_name=""): self.logger.info( f"inside Method parse_one_detail_page (todo...) of Class FangSpider. url = {response.url}; apt_id = {apt_id}; city_name = {city_name}" ) def url_contains_error(self, result_obj_path=""): if not isinstance(result_obj_path, str) or 1 > len(result_obj_path): return False path_fragment_list = result_obj_path.split("/") if 1 > len(path_fragment_list): return False # https://sz.esf.fang.com/staticsearchlist/Error/Error404?aspxerrorpath=/house-a013057/i330/i330 for one in path_fragment_list: if -1 < one.find("Error") or -1 < one.find( "Error404") or -1 < one.find("staticsearchlist"): self.logger.info( f"Error! Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {result_obj_path}" ) return True # http://search.fang.com/captcha-verify/redirect?h=https://wuxi.zu.fang.com/chuzu/3_166962621_1.htm for one in path_fragment_list: if -1 < one.find("captcha") or -1 < one.find("verify"): self.logger.info( f"Need captcha-verify! Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {result_obj_path}" ) return True return False def parse(self, response): url = response.url # detailed page: https://gz.zu.fang.com/chuzu/3_238110671_1.htm?channel=3,8 # list page (first): https://gz.zu.fang.com/ # list page (next): https://gz.zu.fang.com/house/i32/ result_obj = parse.urlparse(url) has_url_error = self.url_contains_error( result_obj_path=result_obj.path) if has_url_error: return False detail_page = False now = datetime.datetime.now() url_list = url.split("/") while "" in url_list: url_list.remove("") html_filename = "{}.html".format(now.strftime("%Y%m%d_%H%M%S")) today = f'{now.strftime("%Y%m%d")}' apt_id = "" city_name = "" if 0 < len(url_list): last_part = url_list[len(url_list) - 1] temp_list = url_list[1].split( "." 
) # empty "" element has been removed; "gz.zu.fang.com" == url_list[1] # not a strong code city_name = temp_list[0] if -1 < last_part.find(".htm"): detail_page = True temp = last_part.split("_") if 1 < len(temp): apt_id = f"{temp[1]}" html_filename = f"{city_name}_{apt_id}_{today}.html" elif -1 < last_part.find("fang.com"): html_filename = f"{city_name}_index1_{today}.html" else: page = last_part[2:] html_filename = f"{city_name}_index{page}_{today}.html" html_file_path = os.path.join(self.detail_html_dir, html_filename) with open(html_file_path, 'wb') as f: f.write(response.body) if detail_page: text = self.parse_response_field(response=response, city_name=city_name, apt_id=apt_id) try: loader = ItemLoader(item=FangItem(), response=response) loader = self.load_items_into_loader(loader=loader, text=text, url=url) yield loader.load_item() except Exception as ex: print( f"Error happened during parsing in Method read_and_parse of Class FangSpider. Exception = {ex}" ) else: url_list = url.split("fang.com") base_url = "" if 0 < len(url_list): base_url = f"{url_list[0]}fang.com" total_pages = response.xpath( '//div[@class="fanye"]/a/@href').extract() if 0 < len(total_pages): last_page = total_pages[len(total_pages) - 1] # /house/i33/ last_page = last_page[9:] last_page = last_page.strip('/') if last_page is not None and 0 < len(last_page): for i in range(int(last_page) - 1): next_url = base_url + f'/house/i3{i + 2}/' self.logger.info( f"\ngoing to the next list page at {next_url}") yield response.follow(next_url, self.parse) apartments = response.xpath( '//dl[@class="list hiddenMap rel"]/dt[@class="img rel floatl"]' ) for one_apt in apartments: next_url = base_url + one_apt.css( "a::attr(href)").extract_first(default='') self.logger.info( f"\ngoing to the next detail page at {next_url}") yield response.follow(next_url, self.parse)