# Assumed imports for the methods in this excerpt (module paths are assumptions):
#   import re
#   import sys
#   from urllib import parse
#   from scrapy.selector import Selector
#   from ..libs.common_class import CommonClass  # project-local string helpers

def extract_link_list(self, response=None):
    record_list = []
    tr_list = response.xpath('//table[@class="resultTableC"]/tbody/tr')
    for one_tr in tr_list:
        try:
            # one_tr is already a <tr>; the original './tr/td/a/@href' could
            # never match, so use a path relative to this row instead
            detailed_page_link = one_tr.xpath('./td/a/@href').extract_first(default="")
            detailed_page_link = CommonClass.clean_string(
                string=detailed_page_link,
                char_to_remove=['\r', '\n', '\t', ' ', ])
            td_list = one_tr.xpath('./td')
            value_list = [
                one_td.xpath("./a/text()").extract_first(default="")
                for one_td in td_list
            ]
            if 7 == len(value_list):
                # skip rows whose 7 fields are all empty strings
                not_empty = any(
                    isinstance(one_value, str) and 0 < len(one_value)
                    for one_value in value_list)
                if not_empty:
                    this_record = {
                        "序号": value_list[0],
                        "项目名称": value_list[1],
                        "开发商": value_list[2],
                        "预售证": value_list[3],
                        "项目地址": value_list[4],
                        "住宅已售套数": value_list[5],
                        "住宅未售套数": value_list[6],
                        "详情链接": detailed_page_link,
                    }
                    record_list.append(this_record)
            else:
                error_msg = f"value_list ({value_list}) has length other than 7"
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class "
                    f"{self.__class__.__name__}, {error_msg}")
        except Exception as ex:
            error_msg = f"xpath error! Exception = {ex}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class "
                f"{self.__class__.__name__}, {error_msg}")
    if 1 > len(record_list):
        error_msg = f"Fail to extract links from {response.url}"
        self.logger.error(
            f"Inside Method {sys._getframe().f_code.co_name} of Class "
            f"{self.__class__.__name__}, {error_msg}")
    return record_list
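# CommonClass is a project-local helper that is not part of this excerpt. A
# minimal sketch of the three helpers the methods here rely on, assuming
# clean_string() removes every occurrence of each listed substring,
# remove_0_len_element() drops empty items, and
# get_cleaned_string_by_splitting_list() joins cleaned pieces with "___".
# The names come from the call sites; the exact bodies are assumptions:

class CommonClass:

    @staticmethod
    def clean_string(string="", char_to_remove=None):
        # remove every occurrence of each substring in char_to_remove
        for one_char in (char_to_remove or []):
            string = string.replace(one_char, "")
        return string

    @staticmethod
    def remove_0_len_element(list4remove=None):
        # drop zero-length elements and return the remainder as a list
        return [one for one in (list4remove or []) if 0 < len(one)]

    @staticmethod
    def get_cleaned_string_by_splitting_list(string_or_list=None, char_to_remove=None):
        # accept either one string or a list of strings; clean each piece
        # and join the non-empty results with "___"
        if isinstance(string_or_list, str):
            string_or_list = [string_or_list]
        cleaned = [
            CommonClass.clean_string(string=one, char_to_remove=char_to_remove)
            for one in (string_or_list or [])
        ]
        return "___".join(one for one in cleaned if 0 < len(one))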
def replace_one_node_text(self, node=None, this_node_class_name20190505=""):
    if node is None:
        return ""
    this_node_class_name = node.xpath("./@class").extract_first(default="")

    # the following lines handle the anticrawl method updated on 20190505
    this_node_get_text = node.get()
    if this_node_get_text is not None and 0 < len(this_node_get_text):
        this_node_get_text5 = this_node_get_text.encode('unicode_escape').decode('utf-8')
        # a single obfuscated glyph escapes to exactly r"\uXXXX" (6 characters)
        if (6 == len(this_node_get_text5) and '\\' == this_node_get_text5[0]
                and 'u' == this_node_get_text5[1]
                and -1 < this_node_class_name20190505.find("shopNum")):
            key = this_node_get_text5[2:]
            if key in self.database_anticrawl20190505_table.keys():
                # self.logger.warning(f"{this_node_get_text5} ==> {key}; found in {self.database_anticrawl20190505_table[key]}")
                return self.database_anticrawl20190505_table[key]

    # glyphs with no shopNum-style class (e.g. ¥ ==> \uffe5) fall through
    # to the class-to-character mapping below
    not_in_class_mapping_dict = False
    for key in self.class_mapping_dict:
        this_dict = self.class_mapping_dict[key]
        key_length = this_dict['key_length']
        all_keys = this_dict['all_keys']
        if (key_length < len(this_node_class_name)
                and this_node_class_name[:key_length] in all_keys):
            value = this_dict['class_mapping'][this_node_class_name] \
                if this_node_class_name in this_dict['class_mapping'].keys() else ""
            if 0 < len(value):
                return value
            not_in_class_mapping_dict = True
            self.logger.error(
                f"cannot find {this_node_class_name} in saved mapping class {key}.")
    if not_in_class_mapping_dict:
        return ""
    return CommonClass.clean_string(
        string=node.get(), char_to_remove=['\r', '\n', '\t', ' ', ])
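# The two lookup tables consulted above are built elsewhere in the spider.
# Their assumed shapes, reconstructed from how they are indexed here (all
# names and sample values are illustrative, not actual data):
#
#   self.database_anticrawl20190505_table = {
#       # 4-hex-digit key of an escaped "\uXXXX" glyph ==> real character
#       "e5d4": "9",
#       "f2a1": "4",
#   }
#
#   self.class_mapping_dict = {
#       "shopNum": {
#           "key_length": 7,                       # length of the class-name prefix
#           "all_keys": ["shopNum"],               # accepted prefixes
#           "class_mapping": {"shopNum5e": "8"},   # full class name ==> character
#       },
#   }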
def parse_detailed_response_field(self, response=None, city=""):
    text = {}
    if response is None:
        return text
    if "READ_HTML" == self.run_purpose and not isinstance(response, Selector):
        return text
    information_div = response.xpath("//div[@id='printData1']")
    title = information_div.xpath(
        "./div[@class='tit_box01']/text()").extract_first(default="")
    land_id = information_div.xpath(
        "./div[@class='menubox01 mt20']/span[@class='gray2']/text()"
    ).extract_first(default="")
    province_city = information_div.xpath(
        "string(./div[@class='menubox01 p0515']/div[@class='fl'])").extract()
    province_city = "___".join(province_city)
    if 0 < len(title):
        text["title"] = title
    if 0 < len(land_id):
        text["land_id"] = land_id
    if 0 < len(province_city):
        text["province_city"] = province_city

    key1 = information_div.xpath(
        "./div[@class='p1015']/div[@class='tit_box02 border03']/text()"
    ).extract_first(default="")
    if "土地基本信息" == key1:
        basic_info = {}
        tr_list1 = information_div.xpath(
            "./div[@class='p1015']/div[@class='tit_box02 border03']/following-sibling::table[@class='tablebox02 mt10']/tbody/tr")
        for index, one_tr in enumerate(tr_list1):
            string_list = one_tr.xpath("string(.)").extract()
            td_list = []
            for one_str in string_list:
                cleaned_str = CommonClass.clean_string(
                    string=one_str, char_to_remove=['\xa0', '\n', '\t', ' ', ])
                td_list.append(cleaned_str.strip('\r'))
            basic_info[index] = "___".join(td_list)
        text[key1] = basic_info

    key2 = information_div.xpath(
        "./div[@class='p1015']/div[@class='tit_box02 border03 mt20']/text()"
    ).extract_first(default="")
    if "土地交易信息" == key2:
        trade_info = {}
        tr_list2 = information_div.xpath(
            "./div[@class='p1015']/div[@class='tit_box02 border03 mt20']/following-sibling::div[@class='banbox']/table[@class='tablebox02 mt10']/tbody/tr")
        for index, one_tr in enumerate(tr_list2):
            string_list = one_tr.xpath("string(.)").extract()
            td_list = []
            for one_str in string_list:
                cleaned_str = CommonClass.clean_string(
                    string=one_str, char_to_remove=['\xa0', '\n', '\t', ' ', ])
                td_list.append(cleaned_str.strip('\r'))
            trade_info[index] = "___".join(td_list)
        text[key2] = trade_info

    # 20190730: the 土地评估结果 (land evaluation) block cannot be extracted yet; TODO
    # First attempt (kept for reference):
    # evaluation_div = response.xpath("//div[@id='divpg']")
    # key3 = evaluation_div.xpath("./div[@class='tit_box02 border03 mt20']/text()").extract_first(default="")
    # if "土地评估结果" == key3:
    #     evaluation_dict = {}
    #     tr_list3 = evaluation_div.xpath("./div[@class='table-03']/table[@class='mt5']/tbody/tr")
    #     for index, one_tr in enumerate(tr_list3):
    #         this_td = one_tr.xpath("./td")
    #         if this_td is None:
    #             string_list = one_tr.xpath("string(./th)").extract()
    #         else:
    #             td_list = one_tr.xpath("./td")
    #             string_list = []
    #             for one_td in td_list:
    #                 unit = one_td.xpath("./text()").extract_first(default="")
    #                 amount = one_td.xpath("./span/text()").extract_first(default="")
    #                 string_list.append(f"{amount}___{unit}")
    #                 # this_td_str_list = one_td.xpath("string(.)").extract()
    #                 # string_list.extend(this_td_str_list)
    #         td_th_list = []
    #         for one_str in string_list:
    #             cleaned_str = CommonClass.clean_string(string=one_str, char_to_remove=['\xa0', '\n', '\t', ' ', ])
    #             td_th_list.append(cleaned_str.strip('\r'))
    #         evaluation_dict[index] = "___".join(td_th_list)
    #     text[key3] = evaluation_dict

    # Second attempt (kept for reference):
    # evaluation_div = response.xpath("//div[@id='divpg']")
    # key3 = evaluation_div.xpath("./div[@class='tit_box02 border03 mt20']/text()").extract_first(default="")
    # if "土地评估结果" == key3:
    #     evaluation_dict = {}
    #     th_list3 = evaluation_div.xpath("./div[@class='table-03']/table[@class='mt5']/tbody/tr/th")
    #     string_list = th_list3.xpath("string(.)").extract()
    #     evaluation_dict["fields"] = "___".join(string_list)
    #     tr_list3 = evaluation_div.xpath("./div[@class='table-03']/table[@class='mt5']/tbody/tr")
    #     row2 = tr_list3[1].xpath("./td")
    #     str1 = row2[0].xpath("./text()").extract_first(default="")
    #     str2 = "___".join(row2[1].xpath("string(.)").extract())
    #     str3amount = response.xpath("//span[@id='scbj_bpgj']").get()
    #     str3unit = row2[2].xpath("./text()").extract_first(default="")
    #     str4amount = response.xpath("//span[@id='scbj_bSumPrice']").get()
    #     str4unit = row2[3].xpath("./text()").extract_first(default="")
    #     str5 = row2[4].xpath("./a/@href").extract_first(default="")
    #     evaluation_dict[str1] = f"{str2}___{str3amount} {str3unit}___{str4amount} {str4unit}___{str5}"
    #     row3 = tr_list3[2].xpath("./td")
    #     row3str = row3.xpath("string(.)").extract()
    #     evaluation_dict["假设开发法"] = "___".join(row3str)
    #     text[key3] = evaluation_dict

    if 0 < len(text):
        text["city"] = city
    return text
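# A minimal sketch of how a parser like the one above would be wired into a
# Scrapy callback (the callback name and meta key are assumptions for
# illustration, not code from this repo):
#
#   def parse_detailed_page(self, response):
#       city = response.meta.get("city", "")
#       text = self.parse_detailed_response_field(response=response, city=city)
#       if 0 < len(text):
#           yield text  # hand the dict to the item pipeline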
def parse_one_bus_route_fields(self, response=None, city_str="", route_str=""):
    if response is None:
        return {}
    try:
        url = response.url
        url_obj = parse.urlparse(url)
        bus_route_id = url_obj.path.strip("/")
        bus_line_div = response.xpath("//div[@id='bus_line']")
        bus_line_information_div = bus_line_div.xpath(
            "./div[@class='bus_line_information ']/div[@class='bus_i_content']")
        bus_route_title = bus_line_information_div.xpath(
            "./div[@class='bus_i_t1']/h1/text()").extract_first(default="")
        bus_route_title = CommonClass.clean_string(
            string=bus_route_title, char_to_remove=[' ', ' ', '\xa0', ' ', ])
        bus_route_district = bus_line_information_div.xpath(
            "./div[@class='bus_i_t1']/a[@class='bus_i_t2']/text()"
        ).extract_first(default="")
        bus_route_info_list = bus_line_information_div.xpath(
            "./p[@class='bus_i_t4']/text()").extract()
        bus_route_info_str = ""
        if 0 < len(bus_route_info_list):
            bus_route_info_str = "___".join(bus_route_info_list)
        bus_operation_interval_str = bus_line_div.xpath(
            "./div[@class='bus_label ']/p[@class='bus_label_t2']/text()"
        ).extract_first(default="")

        bus_direction_dict = {}
        all_way_div_list = bus_line_div.xpath("./div[@class='bus_line_top ']")
        for index, one_way_div in enumerate(all_way_div_list):
            one_way_name_text_list = one_way_div.xpath("./div/strong/text()").extract()
            one_way_name = "___".join(one_way_name_text_list) \
                if 0 < len(one_way_name_text_list) else ""
            span_text_list = one_way_div.xpath("./span/text()").extract()
            one_way_stop_number = "___".join(span_text_list) \
                if 0 < len(span_text_list) else ""
            if 0 < len(one_way_stop_number):
                one_way_stop_number = CommonClass.clean_string(
                    string=one_way_stop_number, char_to_remove=[' ', ' ', '\xa0', ])
            bus_direction_dict[index] = {
                "one_way_name": one_way_name,
                "one_way_stop_number": one_way_stop_number,
            }

        bus_route_stop_round_trip_list = bus_line_div.xpath(
            "./div[@class='bus_line_site ']")
        for index, one_direction in enumerate(bus_route_stop_round_trip_list):
            stop_sequence_list = one_direction.xpath(
                "./div[@class='bus_site_layer']/div/i/text()").extract()
            stop_name_list = one_direction.xpath(
                "./div[@class='bus_site_layer']/div/a/text()").extract()
            if len(stop_name_list) == len(stop_sequence_list):
                temp_list = [
                    f"{stop_sequence_list[stop_name_index]}___{stop_name}"
                    for stop_name_index, stop_name in enumerate(stop_name_list)
                ]
                if index in bus_direction_dict.keys():
                    bus_direction_dict[index]["stops"] = temp_list
                else:
                    bus_direction_dict[index] = {"stops": temp_list}

        return {
            "route_title": bus_route_title.strip(),
            "city": city_str,
            "route_name": route_str,
            "route_id": bus_route_id.strip(),
            "route_uri": url,
            "route_district": bus_route_district.strip(),
            "route_info": bus_route_info_str.strip(),
            "operation_interval": bus_operation_interval_str.strip(),
            "bus_directions": bus_direction_dict,
        }
    except Exception as ex:
        error_msg = f"Error happened during parsing. Exception = {ex}"
        self.logger.error(
            f"Inside Method {sys._getframe().f_code.co_name} of Class "
            f"{self.__class__.__name__}, {error_msg}")
        return {}
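# For reference, bus_route_id above comes straight from the URL path via
# urllib.parse, e.g. (illustrative URL, not taken from this repo):
#
#   url_obj = parse.urlparse("https://example.com/x_abc123def")
#   url_obj.path.strip("/")   # ==> "x_abc123def"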
def parse_detailed_response_field(self, response=None, city="", apt_id=""):
    text = {}
    if response is None:
        return text
    if "READ_HTML" == self.run_purpose and not isinstance(response, Selector):
        return text
    title = response.xpath("//div[@id='lpname']/h1/text()").extract_first(default="")
    if 1 > len(title):
        title = response.xpath(
            "//div[@class='tab-cont clearfix']/div[@class='title rel']/h1[@class='title floatl']/text()"
        ).extract_first(default="")
    title_right_box = response.xpath("//div[@class='tab-cont-right']")
    price_div = title_right_box.xpath(
        "./div[@class='tr-line clearfix zf_new_title']/div[@class='trl-item_top']/div[@class='rel floatl']/preceding-sibling::div")
    price_list = price_div.xpath("string(.)").extract()
    price = "___".join(price_list)

    # extract features
    feature_div = title_right_box.xpath(
        "./div[@class='tr-line clearfix']/div[contains(@class,'trl-item1')]")
    feature_dict = {}
    for one_item in feature_div:
        key = one_item.xpath("./div[@class='font14']/text()").extract_first(default="")
        value = one_item.xpath("./div[@class='tt']/text()").extract_first(default="")
        if 0 < len(key):
            feature_dict[key] = CommonClass.clean_string(
                string=value, char_to_remove=['\r', '\n', '\t', ' ', ])

    # extract location information
    location_div = title_right_box.xpath(
        "./div[@class='tr-line']/div[@class='trl-item2 clearfix']")
    location_dict = {}
    for one_location in location_div:
        key = one_location.xpath("./div[@class='lab']/text()").extract_first(default="")
        value_list = one_location.xpath("string(./div[@class='rcont'])").extract()
        temp_list = []
        for one_value in value_list:
            temp = CommonClass.clean_string(
                string=one_value, char_to_remove=['\xa0', '\n', '\t', ' ', ])
            # strip \r separately; keep it out of char_to_remove
            temp_list.append(temp.strip('\r'))
        if 0 < len(key):
            key = CommonClass.clean_string(
                string=key, char_to_remove=['\u2003', '\xa0', '\n', '\t', ' ', ])
            location_dict[key] = "___".join(temp_list)

    information_box = response.xpath("//div[@class='content-item fydes-item']")
    information_title_list = information_box.xpath(
        "string(./div[@class='title'])").extract()
    information_title = "___".join(information_title_list) \
        if 0 < len(information_title_list) else ""
    information1div = information_box.xpath(
        "./div[@class='cont clearfix']/div[@class='text-item clearfix']")
    information_dict = {}
    for one_item in information1div:
        key = one_item.xpath("./span[@class='lab']/text()").extract_first(default="")
        value_list = one_item.xpath("string(./span[@class='rcont'])").extract()
        temp_list = []
        for one_value in value_list:
            temp = CommonClass.clean_string(
                string=one_value, char_to_remove=['\xa0', '\n', '\t', ' ', ])
            temp_list.append(temp.strip('\r'))
        if 0 < len(key):
            information_dict[key] = "___".join(temp_list)

    community_box1 = response.xpath("//div[@id='xq_message']")
    community_title = community_box1.xpath("./text()").extract_first(default="")
    community_title = CommonClass.clean_string(
        string=community_title, char_to_remove=['\xa0', '\n', '\t', ' ', ])
    community_dict = {
        "title": community_title.strip('\r'),
    }
    community_box2 = community_box1.xpath("./following-sibling::div")
    community_box2line1 = community_box2.xpath("./div[@class='topt clearfix']")
    line1_list = community_box2line1.xpath("./div[@class='text-item clearfix']")
    for one_item in line1_list:
        key = one_item.xpath("./span[@class='lab']/text()").extract_first(default="")
        value_list = one_item.xpath("string(./span[@class='rcont'])").extract()
        if 0 < len(key):
            community_dict[key] = "___".join(value_list)
    community_box2line2 = community_box2line1.xpath("./following-sibling::div")
"./following-sibling::div") line2_list = community_box2line2.xpath( "./div[@class='text-item clearfix']") for one_item in line2_list: key = one_item.xpath("./span[@class='lab']/text()").extract_first( default="") value = one_item.xpath( "./span[@class='rcont ']/text()").extract_first(default="") if 0 < len(key): key = CommonClass.clean_string(string=key, char_to_remove=[ '\xa0', '\n', '\t', ' ', ]) community_dict[key] = CommonClass.clean_string(string=value, char_to_remove=[ '\xa0', '\n', '\t', ' ', '\r', ]) community_box2line3 = community_box2line2.xpath( "./following-sibling::div") community_box2line3key = community_box2line3.xpath( "./div[@class='text-item']/span[@class='lab']/text()" ).extract_first(default="") community_box2line3value = community_box2line3.xpath( "string(./div[@class='text-item']/span[@class='rcont'])").extract( ) temp_list = [] for one_value in community_box2line3value: temp = CommonClass.clean_string(string=one_value, char_to_remove=[ '\xa0', '\n', '\t', ' ', ]) temp = temp.strip('\r') if 0 < len(temp): temp_list.append(temp) if 0 < len(community_box2line3key): community_dict[community_box2line3key] = "".join(temp_list) text = { "title": title.strip(), "price": price.strip(), "feature": feature_dict, "location": location_dict, "information": information_dict, "community": community_dict, "city": city, "apt_id": apt_id, } return text
def parse_response_field(self, response=None, city_name="", apt_id=""):
    text = {}
    if response is None:
        return text
    if "READ_HTML" == self.run_purpose and not isinstance(response, Selector):
        return text
    address_list = response.xpath(
        '//div[@class="trl-item2 clearfix"]/div[@class="rcont"]')
    # note: an xpath starting with '//' on a sub-selector searches the whole
    # document, so the original '//div[@class="rcont"]/a/text()' here was
    # effectively absolute; a relative './a/text()' is what was intended
    address = address_list[0].xpath('./a/text()').extract_first(default="") \
        if 0 < len(address_list) else ""
    location_list = response.xpath(
        '//div[@class="trl-item2 clearfix"]/div[@class="rcont address_zf"]/a/text()'
    ).extract()
    if location_list is None or 1 > len(location_list):
        location_list = response.xpath(
            '//div[@class="trl-item2 clearfix"]/div[@class="rcont"]/a[@class="link-under"]/text()'
        ).extract()
        address_list = response.xpath(
            '//div[@class="trl-item2 clearfix"]/div[@class="rcont"]/a[not(@class)]/text()'
        ).extract()
        address = ""
        if 0 < len(address_list):
            address = ";".join(address_list)
    location_list.reverse()
    location = "".join(location_list)
    if 0 < len(address):
        address = CommonClass.clean_string(
            string=address, char_to_remove=['\r', '\n', '\t', '"', ])
    if 0 < len(location):
        location = CommonClass.clean_string(
            string=location, char_to_remove=['\r', '\n', '\t', '"', ])

    rent_div = response.xpath(
        '//div[@class="tr-line clearfix zf_new_title"]/div[@class="trl-item sty1 rel"]')
    if rent_div is None or 1 > len(rent_div):
        rent_div = response.xpath(
            '//div[@class="tr-line clearfix zf_new_title"]/div[@class="trl-item sty1"]')
    rent_list = []
    for one_rent in rent_div.css('::text').extract():
        temp2 = one_rent.replace("\n", " ").strip()
        if 0 < len(temp2):
            rent_list.append(temp2)
    rent = ""
    if 1 < len(rent_list):
        rent = rent_list[0] + rent_list[1]

    rent_type_div = response.xpath('//div[@class="trl-item1 w146"]/div[@class="tt"]')
    rent_type = rent_type_div[0].css('div::text').extract_first(default="") \
        if 0 < len(rent_type_div) else ""
    facing = rent_type_div[1].css('div::text').extract_first(default="") \
        if 1 < len(rent_type_div) else ""
    apt_type_div = response.xpath('//div[@class="trl-item1 w182"]/div[@class="tt"]')
    apt_type = apt_type_div[0].css('div::text').extract_first(default="") \
        if 0 < len(apt_type_div) else ""
    floor = apt_type_div[1].css('div::text').extract_first(default="") \
        if 1 < len(apt_type_div) else ""
    area_div = response.xpath('//div[@class="trl-item1 w132"]/div[@class="tt"]')
    area = area_div[0].css('div::text').extract_first(default="") \
        if 0 < len(area_div) else ""
    decorate = area_div[1].css('div::text').extract_first(default="") \
        if 1 < len(area_div) else ""
    update_date_spans = response.xpath('//p[@class="gray9 fybh-zf"]/span')
    update_date = ""
    if 1 < len(update_date_spans):
        update_date = update_date_spans[1].css("::text").extract_first(default="")

    text = {
        "rent_id": f"{city_name}_{apt_id.strip()}_{self.overwrite_today}",
        "location": location.strip(),
        "address": address.strip(),
        "rent": rent.strip(),
        "rent_type": rent_type.strip(),
        "facing": facing.strip(),
        "apt_type": apt_type.strip(),
        "floor": floor.strip(),
        "area": area.strip(),
        "decorate": decorate.strip(),
        "update_date": update_date.strip(),
    }
    return text
def format_css(self, css=""):
    return_dict = {}
    if css is None or 1 > len(css):
        return return_dict
    csslen = len(css)
    i = 0
    key = None  # initialized so a malformed leading '}' cannot raise NameError
    xy_dict = {}
    svg_file_names = {}
    start = 0
    skip_font_face = False
    while i < csslen:
        if css[i] == '{':
            key = css[start:i]
            if -1 < key.find("."):
                key = CommonClass.clean_string(
                    string=key, char_to_remove=['\r', '\n', '\t', '.', ' ', ])
            elif -1 < key.find("class^="):
                key = CommonClass.clean_string(
                    string=key,
                    char_to_remove=['\r', '\n', '\t', '.', ' ', ']', '"', ])
                key_list = key.split("[class^=")
                if 2 == len(key_list):
                    key = f"{key_list[0]}___{key_list[1]}"
                else:
                    self.logger.error(f"Error! key_list = {key_list}; key = {key}")
                    break
            elif -1 < key.find("@font-face"):
                skip_font_face = True
            else:
                self.logger.error(f"Error! key = {key}")
                break
            i += 1
            start = i
        elif css[i] == '}':
            value = css[start:i]
            value = CommonClass.clean_string(
                string=value, char_to_remove=['\r', '\n', '\t', ])
            if key is None or 1 > len(key):
                self.logger.error(f"Error! key is None. value = {value}")
                break
            if -1 < value.find("background:"):
                value_list = value.split(" ")
                value_list = CommonClass.remove_0_len_element(list4remove=value_list)
                if 2 == len(value_list):
                    x_str = CommonClass.clean_string(
                        string=value_list[0], char_to_remove=['background:', 'px', ])
                    y_str = CommonClass.clean_string(
                        string=value_list[1], char_to_remove=['px;', ])
                    x = float(x_str)
                    y = float(y_str)  # x, y could equal 0.0
                    xy_dict[key] = {'x': x, 'y': y, }
                    key = None
                else:
                    self.logger.error(
                        f"Wrong value_list len. value = {value}; value_list = {value_list}")
                    break
            elif -1 < value.find("background-image:"):
                searchObj = re.search(r'url\((.*?)\)', value, re.M)
                if searchObj is None:
                    self.logger.error(f"url not found. value = {value}")
                    break
                temp = searchObj.group()
                temp_list = temp.split("/")
                temp = CommonClass.clean_string(string=temp, char_to_remove=["url(", ")"])
                key_list = key.split("___")
                if 2 != len(key_list):
                    self.logger.error(
                        f"Error! len of key_list is NOT 2. key_list = {key_list}; value = {value}")
                    break
                if key_list[1] in xy_dict.keys():
                    # the original message referenced an undefined value_list here
                    self.logger.error(
                        f"Error! xy_dict has key {key_list[1]}; "
                        f"value = {xy_dict[key_list[1]]}; css value = {value}")
                    self.logger.info(xy_dict)
                    break
                svg_file_names[key_list[1]] = {
                    'element': key_list[0],
                    'filename': (temp_list[-1]).replace(')', ''),
                    'url': temp,
                }
                if 1 > self.key_length:
                    self.key_length = len(key_list[1])
                key = None
            elif skip_font_face:
                self.logger.warning("@font-face skipped")
                skip_font_face = False
            else:
                self.logger.warning(
                    f"background-image: not found. value = {value}; i = {i}")
                # do NOT break here; just log it
            i += 1
            start = i
        else:
            i += 1
    return_dict['xy_dict'] = xy_dict
    return_dict['svg_file_names'] = svg_file_names
    return return_dict
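# A hypothetical walk-through of format_css (class names and the SVG URL are
# made up for illustration; the expected results follow the parsing logic above):
#
#   css = (
#       '.kws5e{background:-84.0px -280.0px;}'
#       'span[class^="kws"]{background-image: url(//example.invalid/fonts/num.svg);}'
#   )
#   result = self.format_css(css=css)
#   # result['xy_dict']        == {'kws5e': {'x': -84.0, 'y': -280.0}}
#   # result['svg_file_names'] == {'kws': {'element': 'span',
#   #                                      'filename': 'num.svg',
#   #                                      'url': '//example.invalid/fonts/num.svg'}}
#   # self.key_length          == 3  (length of the class-name prefix 'kws')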
def get_parse_dict_on_list_page(self, one_li=None, channel=""):
    """The html pages of different channels have different xpaths;
    return the right dict according to the input channel.
    self.database_common_channel_list_table includes all channels
    except 'hotel', 'ch70', and 'ch90'.
    """
    this_page_xpath = {}
    this_page_dict = {}
    need_clean = []
    use_extract = []
    need_split_and_clean = []
    if channel in self.database_common_channel_list_table:
        use_extract = ['group_deal_list']
        this_page_xpath = {
            'title': "./div[@class='txt']/div[@class='tit']/a/h4/text()",
            'shop_id': "./div[@class='txt']/div[@class='tit']/a/@data-shopid",
            'star': "./div[@class='txt']/div[@class='comment']/span[contains(@class, 'sml-rank-stars')]/@title",
            'group_deal': "./div/a[@data-click-name='shop_info_groupdeal_click']/@title",
            # group_deal_list found in ['ch10', 'ch15', 'ch30', 'ch45', 'ch50', 'ch65', 'ch75', 'ch80', 'ch85', 'ch95', ]
            'group_deal_list': "./div[@class='svr-info']/div/a[@data-click-name='shop_info_groupdeal_click']/@title",
            'address': "./div/a[@data-click-name='shop_map_click']/@data-address",
            'out_of_business': "./div[@class='txt']/div[@class='tit']/span[@class='istopTrade']/text()",
        }
        if 'ch10' == channel:
            need_split_and_clean = ['recommended_dishes']
            this_page_xpath['takeway'] = "./div/a[@data-click-name='shop_info_takeway_click']/@title"
            this_page_xpath['recommended_dishes'] = "string(./div[@class='txt']/div[@class='recommend'])"
        elif channel in ['ch30', 'ch25']:
            this_page_xpath['group_deal'] = "./div[@class='txt']/div[@class='tit']/div/a[@class='igroup']/@title"
    elif channel in ['ch70', 'ch90', ]:
        # ch90 (home decoration) is a new channel added after 2019-05; it has no
        # string obfuscation at all, so Chinese text and digits can be read directly.
        # Its xpaths are identical to ch70's, so the two branches are merged here.
        this_page_xpath = {
            'title': "./div[@class='info baby-info']/p[@class='title']/a[@class='shopname']/text()",
            'branch': "./div[@class='info baby-info']/p[@class='title']/span[@class='icon-sale']/a[@class='shopbranch']/em/text()",
            'shop_id': "./@data-shopid",
            'star': "./div[@class='info baby-info']/p[@class='remark']/span[contains(@class, 'item-rank-rst')]/@title",
            'review_numbers': "./div[@class='info baby-info']/p[@class='baby-info-scraps']/span[@class='comment-count']/a/text()",
            'mean_prices': "string(./div[@class='info baby-info']/p[@class='baby-info-scraps']/span[@class='average'])",
            'group_deal': "./div[@class='info baby-info']/div[@class='tuan-info']/a[@class='tuan']/@title",
        }
        need_clean = ['mean_prices', ]
    elif channel in ['hotel']:
        use_extract = ['hotel_tags']
        need_clean = ['place', 'price', ]
        this_page_xpath = {
            'shop_id': "./@data-poi",
            'title': "./div[@class='hotel-info-ctn']/div[@class='hotel-info-main']/h2[@class='hotel-name']/a/text()",
            'place': "string(./div[@class='hotel-info-ctn']/div[@class='hotel-info-main']/p[@class='place'])",
            'hotel_tags': "./div[@class='hotel-info-ctn']/div[@class='hotel-info-main']/p[@class='hotel-tags']/span/text()",
"./div[@class='hotel-info-ctn']/div[@class='hotel-info-main']/p[@class='hotel-tags']/span/text()", 'price': "string(./div[@class='hotel-info-ctn']/div[@class='hotel-remark']/div[@class='price']/p)", 'star': "./div[@class='hotel-info-ctn']/div[@class='hotel-remark']/div[@class='remark']/div[@class='item-rank-ctn']/div[@class='item-rank-ctn']/span/@class", 'review_numbers': "./div[@class='hotel-info-ctn']/div[@class='hotel-remark']/div[@class='remark']/div[@class='item-rank-ctn']/div[@class='item-rank-ctn']/a/text()", } if one_li is not None: for index, key in enumerate(this_page_xpath): if key in use_extract: temp_list = one_li.xpath(this_page_xpath[key]).extract() this_page_dict[ key] = CommonClass.get_cleaned_string_by_splitting_list( string_or_list=temp_list, char_to_remove=[ '\r', '\n', '\t', ' ', ]) elif key in need_clean: temp_str = one_li.xpath( this_page_xpath[key]).extract_first(default="") this_page_dict[key] = CommonClass.clean_string( string=temp_str, char_to_remove=[ '\r', '\n', '\t', ' ', ]) elif key in need_split_and_clean: temp_string = one_li.xpath( this_page_xpath[key]).extract_first(default="") this_page_dict[ key] = CommonClass.get_cleaned_string_by_splitting_list( string_or_list=temp_string, char_to_remove=[ '\r', '\n', '\t', ' ', ]) else: this_page_dict[key] = one_li.xpath( this_page_xpath[key]).extract_first(default="") # special fields if channel in ['hotel']: if 'star' in this_page_dict.keys(): temp = this_page_dict['star'].replace( "sml-rank-stars sml-str", "") if re.match(r'^(\d)+$', temp): temp = int(temp) if temp in self.database_merchant_star_level_table.keys( ): this_page_dict[ 'star'] = self.database_merchant_star_level_table[ temp] else: this_page_dict['star'] = this_page_dict[ 'star'].replace("sml-rank-stars sml-str", "") else: this_page_dict['star'] = temp if 'review_numbers' in this_page_dict.keys(): this_page_dict['review_numbers'] = this_page_dict[ 'review_numbers'].replace("(", "") this_page_dict['review_numbers'] = this_page_dict[ 'review_numbers'].replace(")", "") shop_id = this_page_dict[ 'shop_id'] if 'shop_id' in this_page_dict.keys() else '0' # extract special nodes # no by now return this_page_dict, shop_id