def process_item(self, item, spider): """ revision: 20190730 """ self.init_self_attributes(spider) page_type = "" for index, one in enumerate(item): if "page_type" == one and 1 == len(item["page_type"]): page_type = str(item["page_type"][0]) excluded_list = [ "page_type", ] key_list1, item_list1 = CommonScrapyPipelineClass.get_items_and_keys( item=item, excluded_key_list=excluded_list) index = -1 content_dict = {} if "content" in key_list1 and "detailed" == page_type: index = key_list1.index("content") if -1 < index and index < len(item_list1): content_dict = eval(item_list1[index]) item_list1.remove(item_list1[index]) key_list1.remove("content") keys = [] items = [] for key, value in content_dict.items(): keys.append(key) items.append(value) key_list = keys + key_list1 item_list = items + item_list1 CommonScrapyPipelineClass.append_row( spider_obj=spider, key_list=key_list, item_list=item_list, csv_file_path_str=self.csv_file_path) if self.to_kafka and socket.gethostname( ) in self.cluster_servers_for_spiders: CommonScrapyPipelineClass.pipeline_to_kafka( spider_obj=spider, key_list=key_list, item_list=item_list, kafka_topic_str=self.kafka_topic, kafka_producer_obj=self.kafka_producer) elif "detailed" == page_type: error_msg = f"no content in key_list1 ({key_list1})" spider.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) return item
def process_item(self, item, spider): """ there are so many lat, and lng for one bus route (one item), therefore we do not request amap here. """ self.init_self_attributes(spider=spider) page_type = "" for index, one in enumerate(item): if "page_type" == one and 1 == len(item["page_type"]): page_type = str(item["page_type"][0]) break excluded_list = [ "page_type", ] key_list1, item_list1 = CommonScrapyPipelineClass.get_items_and_keys( item=item, excluded_key_list=excluded_list) if "detailed" == page_type: result_bool, key_list, item_list = CommonScrapyPipelineClass.extract_items_and_keys_from_content( raw_key_list=key_list1, raw_item_list=item_list1, content_field_name_str="content") if result_bool: CommonScrapyPipelineClass.append_row( spider_obj=spider, key_list=key_list, item_list=item_list, csv_file_path_str=self.csv_file_path) if self.to_kafka and socket.gethostname( ) in self.cluster_servers_for_spiders: CommonScrapyPipelineClass.pipeline_to_kafka( spider_obj=spider, key_list=key_list, item_list=item_list, kafka_topic_str=self.kafka_topic, kafka_producer_obj=self.kafka_producer) else: spider.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, no content in key_list1 ({key_list1})" ) return item
def process_item(self, item, spider):
    self.init_self_attributes(spider=spider)
    random_key = random.randint(0, len(self.key_list) - 1)
    account = self.key_list[random_key]

    page_type = ""
    for index, one in enumerate(item):
        if "page_type" == one and 1 == len(item["page_type"]):
            page_type = str(item["page_type"][0])
            break

    excluded_list = ["page_type", ]
    all_keys1, item_list1 = self.get_items_and_keys(
        item=item, excluded_list=excluded_list)

    index = -1
    content_dict = {}
    if "content" in all_keys1 and "detailed" == page_type:
        index = all_keys1.index("content")
        if -1 < index and index < len(item_list1):
            # "content" holds the repr of a dict; pop by index (not remove by
            # value) so a duplicate value cannot delete the wrong element
            content_dict = eval(item_list1[index])
            item_list1.pop(index)
            all_keys1.remove("content")
        content_dict["longitude"] = np.nan
        content_dict["latitude"] = np.nan
        content_dict["adcode"] = np.nan

        # request Gaode here
        temp_list = ""
        if isinstance(item["url"], list):
            temp_list = str(item["url"][0]).replace("https://", "")
        elif isinstance(item["url"], str):
            temp_list = item["url"].replace("https://", "")
        temp_list = temp_list.split(".")
        city_name = temp_list[0] if 0 < len(temp_list) and 0 < len(temp_list[0]) else ""
        if 0 < len(city_name):
            city_name = self.check_city_name(city_name)

        # try these content fields in order until one geocodes successfully
        three_requests_for_tryout = ["location", "address", ]
        for one_tryout in three_requests_for_tryout:
            if one_tryout in content_dict.keys():
                result_dict = {}
                params = {
                    "key": account,
                    "address": str(self.clean_addr(content_dict[one_tryout])),
                    "city": city_name,
                }
                response = requests.get(self.base_gaode_url,
                                        headers=self.headers,
                                        params=params)
                if 200 == response.status_code:
                    if self.save_every_response is not None and self.save_every_response:
                        self.save_reponsed_json_file(
                            rent_id=content_dict["rent_id"],
                            response=response.text)
                    result_dict = self.parse_gaode_json(response.text)
                    if 0 < result_dict["count"]:
                        content_dict["longitude"] = result_dict["longitude"]
                        content_dict["latitude"] = result_dict["latitude"]
                        content_dict["adcode"] = result_dict["adcode"]
                        break

        keys = []
        items = []
        for key, value in content_dict.items():
            keys.append(key)
            items.append(value)
        key_list = keys + all_keys1
        item_list = items + item_list1

        CommonScrapyPipelineClass.append_row(
            spider_obj=spider,
            key_list=key_list,
            item_list=item_list,
            csv_file_path_str=self.csv_file_path)
        if self.to_kafka and socket.gethostname() in self.cluster_servers_for_spiders:
            CommonScrapyPipelineClass.pipeline_to_kafka(
                spider_obj=spider,
                key_list=key_list,
                item_list=item_list,
                kafka_topic_str=self.kafka_topic,
                kafka_producer_obj=self.kafka_producer)
    elif "detailed" == page_type:
        # no exception object is in scope here, so the message must not reference one
        spider.logger.error(
            f"no content in all_keys1 ({all_keys1}) in Method process_item of Class FangPipeline")
    return item
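# self.parse_gaode_json is assumed to flatten the AMap (Gaode) v3 geocoding
# response into the count / longitude / latitude / adcode keys read above.
# A hedged sketch, assuming the documented response shape where "geocodes"
# is a list and "location" is a "lng,lat" string; the project's real parser
# may differ:
import json
import numpy as np

def parse_gaode_json(json_text):
    """Flatten an AMap v3 geocoding response into one result dict.

    Returns count 0 and NaN coordinate fields when nothing was geocoded,
    so callers can branch on result_dict["count"] as the code above does.
    """
    result_dict = {"count": 0, "longitude": np.nan,
                   "latitude": np.nan, "adcode": np.nan}
    try:
        parsed = json.loads(json_text)
        count = int(parsed.get("count", 0))
        if 0 < count and parsed.get("geocodes"):
            geocode = parsed["geocodes"][0]
            lng, lat = str(geocode.get("location", ",")).split(",")[:2]
            result_dict.update(count=count, longitude=float(lng),
                               latitude=float(lat), adcode=geocode.get("adcode"))
    except (ValueError, KeyError, TypeError):
        pass
    return result_dict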
def process_item(self, item, spider): """ todo: some parts of this method can be moved to commonfunctions.py """ self.init_self_attributes(spider) random_key = random.randint(0, len(self.key_list) - 1) account = self.key_list[random_key] page_type = "" for index, one in enumerate(item): if "page_type" == one and 1 == len(item["page_type"]): page_type = str(item["page_type"][0]) excluded_list = [ "page_type", ] key_list1, item_list1 = CommonScrapyPipelineClass.get_items_and_keys( item=item, excluded_key_list=excluded_list) index = -1 content_dict = {} if "content" in key_list1 and "detailed" == page_type: index = key_list1.index("content") if -1 < index and index < len(item_list1): content_dict = eval(item_list1[index]) item_list1.remove(item_list1[index]) key_list1.remove("content") content_dict["longitude"] = np.nan content_dict["latitude"] = np.nan content_dict["adcode"] = np.nan # request Gaode here if isinstance(item["url"], list): url_str = str(item["url"][0]) elif isinstance(item["url"], str): url_str = item["url"] city_name_fang = self.get_city_or_district_name_from_url( url=url_str) if 0 < len(city_name_fang): city_name_amap = self.switch_city_name( city_name=city_name_fang, spider=spider) community_name = self.extract_community_name( content_dict=content_dict) spider.logger.info( f"requesting Gaode using community name {community_name}" ) if 0 < len(community_name): result_dict = {} params = { "key": account, "address": str( CommonScrapyPipelineClass.clean_addr( text=community_name)), "city": city_name_amap, } try: # 20190621发现爬取佛山的时候因为DNS解析失败而丢失了14条记录。这里增加代码,记录再次丢失。 # socket.gaierror: [Errno -3] Temporary failure in name resolution response = requests.get(self.base_gaode_url, headers=self.headers, params=params) if 200 == response.status_code: if self.save_every_response is not None and self.save_every_response: self.save_reponsed_json_file( apt_id=content_dict["apt_id"], response=response.text, spider=spider) result_dict = CommonScrapyPipelineClass.parse_gaode_json( json_text=response.text) if 0 < (result_dict["count"]): content_dict["longitude"] = result_dict[ "longitude"] content_dict["latitude"] = result_dict[ "latitude"] content_dict["adcode"] = result_dict[ "adcode"] except Exception as ex: spider.logger.error( f"requests or other errors. Exception = {ex}") keys = [] items = [] for key, value in content_dict.items(): keys.append(key) items.append(value) key_list = keys + key_list1 item_list = items + item_list1 CommonScrapyPipelineClass.append_row( spider_obj=spider, key_list=key_list, item_list=item_list, csv_file_path_str=self.csv_file_path) if self.to_kafka and socket.gethostname( ) in self.cluster_servers_for_spiders: CommonScrapyPipelineClass.pipeline_to_kafka( spider_obj=spider, key_list=key_list, item_list=item_list, kafka_topic_str=self.kafka_topic, kafka_producer_obj=self.kafka_producer) elif "detailed" == page_type: error_msg = f"no content in key_list1 ({key_list1})" spider.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) return item