class LianJiaItemLoader(ItemLoader):
    """Custom item loader whose fields default to the ``TakeFirst`` output processor."""

    default_output_processor = TakeFirst()
class RealtorListingFix1JulienLoader(ItemLoader):
    """Item loader that cleans every input value and emits a single value per field."""

    # Each incoming value goes through remove_tags, then str.strip.
    default_input_processor = MapCompose(remove_tags, str.strip)
    # Fields without an explicit output processor use TakeFirst.
    default_output_processor = TakeFirst()
class BrokerLoader(ItemLoader):
    """Item loader for broker records.

    Values are passed through ``remove_tags`` and ``str.strip`` by default;
    the two phone fields are passed through ``serialize_number`` instead.
    """

    # Defaults applied to every field that has no override below.
    default_input_processor = MapCompose(remove_tags, str.strip)
    default_output_processor = TakeFirst()

    # Per-field input overrides (these replace the default input processor).
    brokerMobile_in = MapCompose(serialize_number)
    officePhone_in = MapCompose(serialize_number)
class LagouLoader(ItemLoader):
    """Item loader whose fields default to the ``TakeFirst`` output processor."""

    default_output_processor = TakeFirst()
class Century21OfficeLoader(ItemLoader):
    """Item loader for office records.

    Values are passed through ``remove_tags`` and ``str.strip`` by default and
    reduced with ``TakeFirst``; the phone and address fields override that.
    """

    # Defaults applied to every field that has no override below.
    default_input_processor = MapCompose(remove_tags, str.strip)
    default_output_processor = TakeFirst()

    # The phone number is passed through serialize_number on input.
    officePhone_in = MapCompose(serialize_number)
    # Address parts are combined into one string separated by ', '.
    officeAddress_out = Join(', ')
class LagouJobItemLoader(ItemLoader):
    """Custom item loader whose fields default to the ``TakeFirst`` output processor."""

    default_output_processor = TakeFirst()
class JobBoleArticleItem(scrapy.Item):
    """Scraped article item with helpers to persist it via SQL and ``ArticleType``."""

    # Each title value gets "-jobbole" appended and is then passed to add_jobbole.
    title = scrapy.Field(
        input_processor=MapCompose(lambda x: x + "-jobbole", add_jobbole)
    )
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert),
        output_processor=TakeFirst()
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    # MapCompose as the output processor keeps this field a list of values.
    front_image_url = scrapy.Field(
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()

    def get_insert_sql(self):
        """Build the upsert statement for the ``jobbole_article`` table.

        Returns:
            A ``(sql, params)`` tuple; ``params`` holds exactly one value per
            ``%s`` placeholder, in column order.

        Raises:
            KeyError: if a required field has not been populated on the item.
        """
        # NOTE(review): the column is spelled ``parise_nums`` throughout, as in
        # the original statement -- confirm against the actual table schema.
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums, front_image_url, front_image_path,
            parise_nums, comment_nums, tags, content)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE fav_nums=VALUES(fav_nums), front_image_url=VALUES(front_image_url),
            front_image_path=VALUES(front_image_path), content=VALUES(content),
            parise_nums=VALUES(parise_nums), comment_nums=VALUES(comment_nums), tags=VALUES(tags)
        """

        # front_image_url is kept as a list by its output processor; a single
        # SQL parameter needs a scalar, so bind the first entry (or "").
        front_image_url = self["front_image_url"]
        if isinstance(front_image_url, (list, tuple)):
            front_image_url = front_image_url[0] if front_image_url else ""

        params = (
            self["title"],
            self["url"],
            self["create_date"],
            self["fav_nums"],
            front_image_url,
            # May be absent on the item (save_to_es also treats it as optional).
            self.get("front_image_path", ""),
            self["praise_nums"],
            self["comment_nums"],
            self["tags"],
            self["content"],
        )
        return insert_sql, params

    def save_to_es(self):
        """Copy this item into an ``ArticleType`` document, save it, and bump a counter.

        Side effects: calls ``article.save()`` and increments the
        ``jobbole_count`` key through ``redis_cli``.
        """
        article = ArticleType()
        article.title = self['title']
        article.create_date = self['create_date']
        article.content = remove_tags(self['content'])
        article.front_image_url = self['front_image_url']
        if 'front_image_path' in self:
            article.front_image_path = self['front_image_path']
        article.praise_nums = self['praise_nums']
        article.fav_nums = self['fav_nums']
        article.comment_nums = self['comment_nums']
        article.url = self['url']
        article.tags = self['tags']
        article.meta.id = self['url_object_id']

        # Suggestions are generated from the title and the tags; the numbers
        # are passed through to gen_suggests alongside each text.
        article.suggest = gen_suggests(
            ArticleType._doc_type.index,
            ((article.title, 10), (article.tags, 7)),
        )

        article.save()
        redis_cli.incr("jobbole_count")
class BooksToScrapeItem(scrapy.Item):
    """Book item carrying image fields and a single-valued book name."""

    image_urls = scrapy.Field()
    images = scrapy.Field()
    # Only the first collected value is kept for the name.
    book_name = scrapy.Field(output_processor=TakeFirst())
class StockstarItemLoader(ItemLoader):
    """Custom item loader used to store the field contents scraped by the spider."""

    default_output_processor = TakeFirst()
class Product(scrapy.Item):
    """Product item with a name and a cleaned, single-valued price."""

    name = scrapy.Field()
    # Each price value goes through remove_tags, then filter_price; the first
    # resulting value is kept.
    price = scrapy.Field(
        input_processor=MapCompose(remove_tags, filter_price),
        output_processor=TakeFirst(),
    )
class ArticleItemLoader(ItemLoader):
    """Custom item loader.

    Setting the default here means individual fields do not each have to
    declare ``output_processor = TakeFirst()``.
    """

    default_output_processor = TakeFirst()
class CommonItemLoader(ItemLoader):
    """Item loader whose fields default to the ``TakeFirst`` output processor."""

    default_output_processor = TakeFirst()
class Producto(Item):
    """Product item; every field keeps only the first collected value."""

    # Identity and presentation
    imagen = Field(output_processor=TakeFirst())
    id = Field(output_processor=TakeFirst())
    titulo = Field(output_processor=TakeFirst())

    # Pricing
    precio = Field(output_processor=TakeFirst())
    precio_a = Field(output_processor=TakeFirst())
    precio_b = Field(output_processor=TakeFirst())
    precio_previo = Field(output_processor=TakeFirst())
    reduccion = Field(output_processor=TakeFirst())

    # Catalogue details
    marca = Field(output_processor=TakeFirst())
    url = Field(output_processor=TakeFirst())
    rating = Field(output_processor=TakeFirst())
    review = Field(output_processor=TakeFirst())
    modelId = Field(output_processor=TakeFirst())
    control_type = Field(output_processor=TakeFirst())
class JobboleItemLoader(ItemLoader):
    """Custom item loader whose fields default to the ``TakeFirst`` output processor."""

    default_output_processor = TakeFirst()
class FacebookProfileLoader(ItemLoader):
    """Item loader for profile records; ``fbid`` values are passed through ``to_int``."""

    default_output_processor = TakeFirst()

    # Input override for the fbid field only.
    fbid_in = MapCompose(to_int)
class ReviewItem(scrapy.Item):
    """Product review item.

    All fields keep only the first collected value, except ``images`` and
    ``videos``, which instead pass their input through ``to_str``.
    """

    # Product being reviewed
    product_id = scrapy.Field(output_processor=TakeFirst())
    product_url = scrapy.Field(output_processor=TakeFirst())

    # Review identity and body
    review_text = scrapy.Field(output_processor=TakeFirst())
    review_id = scrapy.Field(output_processor=TakeFirst())
    guid = scrapy.Field(output_processor=TakeFirst())
    images_count = scrapy.Field(output_processor=TakeFirst())
    creation_time = scrapy.Field(output_processor=TakeFirst())
    # No output processor here: only the input side is customised.
    images = scrapy.Field(input_processor=to_str)
    reviewer = scrapy.Field(output_processor=TakeFirst())

    # Purchased variant
    product_color = scrapy.Field(output_processor=TakeFirst())
    product_sales = scrapy.Field(output_processor=TakeFirst())
    product_size = scrapy.Field(output_processor=TakeFirst())
    # No output processor here: only the input side is customised.
    videos = scrapy.Field(input_processor=to_str)

    # Engagement counters
    reply_count = scrapy.Field(output_processor=TakeFirst())
    reply_count_2 = scrapy.Field(output_processor=TakeFirst())
    review_score = scrapy.Field(output_processor=TakeFirst())
    useful_vote_count = scrapy.Field(output_processor=TakeFirst())
class ExampleLoader(ItemLoader):
    """Item loader that builds ``ExampleItem`` objects from stripped values."""

    default_item_class = ExampleItem

    # Every incoming value has .strip() called on it.
    default_input_processor = MapCompose(lambda s: s.strip())
    default_output_processor = TakeFirst()

    # The description keeps all of its values, combined by Join().
    description_out = Join()
class NewsItem(scrapy.Item):
    """News item holding a tag-free, single-valued title."""

    # Each title value goes through remove_tags; the first result is kept.
    news_title = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )
class ArticleItemLoader(ItemLoader):
    """Custom item loader that applies ``TakeFirst`` as the default *input* processor."""

    # NOTE(review): TakeFirst is more commonly installed as
    # default_output_processor; confirm that using it on the input side
    # (which leaves the collected output as a list) is intentional.
    default_input_processor = TakeFirst()
class AppstoreCrawlerItemLoader_cn(ItemLoader):
    """Custom item loader whose fields default to the ``TakeFirst`` output processor."""

    default_output_processor = TakeFirst()
class NewsLoader(ItemLoader):
    """Item loader that keeps one value per field (comparable to ``extract_first()``)."""

    default_output_processor = TakeFirst()
class SpiderItemLoader(ItemLoader):
    """Item loader configured at runtime from a ``db_spider`` object.

    The ``db_spider`` object is taken from the loader context. For every name
    in ``FIELDS`` it is expected to expose:

    * ``<field>``      -- an xpath expression or a literal value,
    * ``<field>_type`` -- 0 (xpath), 1 (xpath used after following the item
      page) or 2 (literal value),
    * ``<field>_in`` / ``<field>_out`` -- source text of the input / output
      processor expression, or an empty string for "use the default".
    """

    FIELDS = [
        'shop_code',
        'shop_title',
        'shop_url',
        'shop_price',
        'shop_availability'
    ]

    # Default processors
    default_output_processor = TakeFirst()

    def __init__(self, item=None, selector=None, response=None, parent=None, **context):
        """Create the loader and install the per-field processors from ``db_spider``.

        Args:
            item, selector, response, parent: forwarded unchanged to ``ItemLoader``.
            **context: loader context; ``db_spider`` is read from it.
        """
        super(SpiderItemLoader, self).__init__(item, selector, response, parent, **context)
        self.db_spider = self.context.get('db_spider')

        # Setup processors: '<field>_in' configures the input processor and
        # '<field>_out' the output processor. Each is stored under its own
        # attribute name so the output expression no longer overwrites the
        # input processor.
        for field in self.FIELDS:
            for suffix in ('_in', '_out'):
                expression = str(getattr(self.db_spider, field + suffix))
                if expression:
                    # SECURITY: eval() executes whatever text is stored on
                    # db_spider. This is only safe while that configuration
                    # is fully trusted -- review before exposing it to users.
                    setattr(self, field + suffix, eval(expression))

    def _sources_of_type(self, wanted_type):
        """Yield ``(field, text)`` for each non-empty field whose ``<field>_type`` equals *wanted_type*."""
        for field in self.FIELDS:
            if getattr(self.db_spider, field + '_type') == wanted_type:
                text = str(getattr(self.db_spider, field))
                if text:
                    yield field, text

    def add_xpaths(self):
        """
        Method for populating item fields from xpaths
        """
        for field, xpath in self._sources_of_type(0):
            self.add_xpath(field, xpath)

    def add_detail_xpaths(self):
        """
        Method for populating item fields from xpaths after following the item page
        """
        for field, xpath in self._sources_of_type(1):
            self.add_xpath(field, xpath)

    def add_values(self):
        """
        Method for populating item fields from values
        """
        for field, value in self._sources_of_type(2):
            self.add_value(field, value)
class ReviewsItemLoader(ItemLoader):
    """Item loader whose fields default to the ``TakeFirst`` output processor."""

    default_output_processor = TakeFirst()
class qimaiItemLoader(ItemLoader):
    """Custom item loader whose fields default to the ``TakeFirst`` output processor."""

    default_output_processor = TakeFirst()
class AttorneyLoader(ItemLoader):
    """Item loader for attorney records.

    Values are passed through ``remove_tags`` and ``str.strip`` by default;
    the phone field is passed through ``serialize_number`` instead.
    """

    # Defaults applied to every field that has no override below.
    default_input_processor = MapCompose(remove_tags, str.strip)
    default_output_processor = TakeFirst()

    # Input override for the phone field only.
    phone_in = MapCompose(serialize_number)
class CoordinateItem(Item):
    """Coordinate pair; both values are converted with ``float`` and stored as scalars."""

    lat = Field(
        input_processor=MapCompose(float),
        output_processor=TakeFirst(),
    )
    lon = Field(
        input_processor=MapCompose(float),
        output_processor=TakeFirst(),
    )
class RealtorListingExtraJulienLoader(ItemLoader):
    """Item loader that cleans input values, except for ``listing``.

    Values are passed through ``remove_tags`` and ``str.strip`` by default and
    reduced with ``TakeFirst``.
    """

    default_input_processor = MapCompose(remove_tags, str.strip)
    default_output_processor = TakeFirst()

    # Identity() leaves the listing input untouched, bypassing the default
    # input processor above.
    listing_in = Identity()
class ParamItem(Item):
    """Key/value pair; both sides are passed through ``strip`` and stored as scalars."""

    key = Field(
        input_processor=MapCompose(strip),
        output_processor=TakeFirst(),
    )
    value = Field(
        input_processor=MapCompose(strip),
        output_processor=TakeFirst(),
    )
class ArticleItemLoader(ItemLoader):
    """Item loader whose fields default to the ``TakeFirst`` output processor."""

    default_output_processor = TakeFirst()
class RatingLoader(ItemLoader):
    """Loader for RatingItem."""

    # Each value is passed, in order, through: identity, str, remove_tags,
    # replace_all_entities, normalize_space.
    default_input_processor = MapCompose(
        identity,
        str,
        remove_tags,
        replace_all_entities,
        normalize_space,
    )
    default_output_processor = TakeFirst()