示例#1
0
class LianJiaItemLoader(ItemLoader):
    """Custom item loader whose fields default to a ``TakeFirst`` output."""

    # Applies to every field that does not declare its own output processor.
    default_output_processor = TakeFirst()
示例#2
0
class RealtorListingFix1JulienLoader(ItemLoader):
    """Item loader with shared default input and output processors."""

    # Each incoming value goes through remove_tags, then str.strip.
    default_input_processor = MapCompose(remove_tags, str.strip)

    # Fields without their own output processor fall back to TakeFirst.
    default_output_processor = TakeFirst()
示例#3
0
class BrokerLoader(ItemLoader):
    """Item loader for broker records.

    The two phone fields replace the default input processor with
    ``serialize_number``; all other fields use the defaults below.
    """

    # Defaults: remove_tags followed by str.strip on input, TakeFirst on output.
    default_input_processor = MapCompose(remove_tags, str.strip)
    default_output_processor = TakeFirst()

    # Per-field input overrides (serialize_number is defined outside this block).
    brokerMobile_in = MapCompose(serialize_number)
    officePhone_in = MapCompose(serialize_number)
示例#4
0
class LagouLoader(ItemLoader):
    """Item loader whose fields default to a ``TakeFirst`` output."""

    default_output_processor = TakeFirst()
示例#5
0
class Century21OfficeLoader(ItemLoader):
    """Item loader for office records.

    ``officePhone`` has its own input processor and ``officeAddress`` its own
    output processor; every other field uses the defaults.
    """

    # Defaults: remove_tags followed by str.strip on input, TakeFirst on output.
    default_input_processor = MapCompose(remove_tags, str.strip)
    default_output_processor = TakeFirst()

    # serialize_number is defined outside this block.
    officePhone_in = MapCompose(serialize_number)

    # Address parts are joined with ", " instead of taking only the first one.
    officeAddress_out = Join(', ')
示例#6
0
class LagouJobItemLoader(ItemLoader):
    """Custom item loader whose fields default to a ``TakeFirst`` output."""

    # Applies to every field that does not declare its own output processor.
    default_output_processor = TakeFirst()
示例#7
0
class JobBoleArticleItem(scrapy.Item):
    """Scraped article item with helpers to persist it.

    ``get_insert_sql`` builds an upsert for the ``jobbole_article`` table and
    ``save_to_es`` stores the article through ``ArticleType``. The processor
    helpers referenced by the fields (``add_jobbole``, ``date_convert``,
    ``get_nums``, ``remove_comment_tags``, ``return_value``) are defined
    outside this block.
    """

    # Each title value gets "-jobbole" appended, then goes through add_jobbole.
    title = scrapy.Field(
        input_processor=MapCompose(lambda x: x + "-jobbole", add_jobbole)
    )
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert),
        output_processor=TakeFirst()
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    # MapCompose as output processor yields a list rather than a single value.
    # NOTE(review): presumably kept as a list for an images pipeline - confirm.
    front_image_url = scrapy.Field(
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    # All tag values are joined into one comma-separated string.
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()

    def get_insert_sql(self):
        """Build the upsert statement for the ``jobbole_article`` table.

        Returns:
            tuple: ``(insert_sql, params)`` where ``params`` holds exactly one
            value per ``%s`` placeholder, in column order.

        Raises:
            KeyError: if a required field (title, url, create_date, fav_nums,
                praise_nums, comment_nums, tags, content) is not populated.
        """
        # NOTE(review): the column is spelled "parise_nums" in the insert
        # column list, so that spelling is used consistently here - confirm
        # against the actual table schema.
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums, front_image_url, front_image_path,
            parise_nums, comment_nums, tags, content)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE fav_nums=VALUES(fav_nums),
            front_image_url=VALUES(front_image_url), front_image_path=VALUES(front_image_path), content=VALUES(content),
            parise_nums=VALUES(parise_nums), comment_nums=VALUES(comment_nums), tags=VALUES(tags)
        """

        # front_image_url is produced as a list by its output processor; a SQL
        # parameter must be a scalar, so store the first URL (or "" if none).
        front_image_url = self.get("front_image_url") or ""
        if isinstance(front_image_url, (list, tuple)):
            front_image_url = front_image_url[0]

        params = (
            self["title"],
            self["url"],
            self["create_date"],
            self["fav_nums"],
            front_image_url,
            # The image path may be unset (save_to_es also treats it as optional).
            self.get("front_image_path", ""),
            self["praise_nums"],
            self["comment_nums"],
            self["tags"],
            self["content"],
        )

        return insert_sql, params

    def save_to_es(self):
        """Copy this item into an ``ArticleType`` document and save it.

        Also increments the ``jobbole_count`` counter through ``redis_cli``.
        """
        article = ArticleType()
        article.title = self['title']
        article.create_date = self['create_date']
        article.content = remove_tags(self['content'])
        article.front_image_url = self['front_image_url']
        # The image path is optional and only copied when present.
        if 'front_image_path' in self:
            article.front_image_path = self['front_image_path']
        article.praise_nums = self['praise_nums']
        article.fav_nums = self['fav_nums']
        article.comment_nums = self['comment_nums']
        article.url = self['url']
        article.tags = self['tags']
        article.meta.id = self['url_object_id']

        # Suggestions are generated from title and tags; the second tuple
        # element is passed to gen_suggests (presumably a weight - confirm).
        article.suggest = gen_suggests(ArticleType._doc_type.index, ((article.title, 10), (article.tags, 7)))

        article.save()

        redis_cli.incr("jobbole_count")
class BooksToScrapeItem(scrapy.Item):
    """Item holding a book name together with its image fields."""

    # NOTE(review): these names match what an images pipeline usually reads
    # and writes - confirm against the project settings.
    image_urls = scrapy.Field()
    images = scrapy.Field()

    # Only the first collected value is kept for the book name.
    book_name = scrapy.Field(output_processor=TakeFirst())
示例#9
0
class StockstarItemLoader(ItemLoader):
    """Custom item loader used to hold the field contents scraped by the spider."""

    # Applies to every field that does not declare its own output processor.
    default_output_processor = TakeFirst()
示例#10
0
class Product(scrapy.Item):
    """Product item with a name and a processed price."""

    name = scrapy.Field()

    # Price values go through remove_tags and then filter_price (defined
    # outside this block); only the first resulting value is kept.
    _price_in = MapCompose(remove_tags, filter_price)
    price = scrapy.Field(input_processor=_price_in, output_processor=TakeFirst())
    del _price_in
示例#11
0
class ArticleItemLoader(ItemLoader):
    """Custom item loader whose fields default to a ``TakeFirst`` output."""

    # Set once here so that individual fields do not each have to declare
    # output_processor = TakeFirst().
    default_output_processor = TakeFirst()
示例#12
0
class CommonItemLoader(ItemLoader):
    """Item loader whose fields default to a ``TakeFirst`` output."""

    default_output_processor = TakeFirst()
class Producto(Item):
    """Product item; every field keeps only its first collected value."""

    # Identification
    imagen = Field(output_processor=TakeFirst())
    id = Field(output_processor=TakeFirst())
    titulo = Field(output_processor=TakeFirst())

    # Pricing
    precio = Field(output_processor=TakeFirst())
    precio_a = Field(output_processor=TakeFirst())
    precio_b = Field(output_processor=TakeFirst())
    precio_previo = Field(output_processor=TakeFirst())
    reduccion = Field(output_processor=TakeFirst())

    # Remaining attributes
    marca = Field(output_processor=TakeFirst())
    url = Field(output_processor=TakeFirst())
    rating = Field(output_processor=TakeFirst())
    review = Field(output_processor=TakeFirst())
    modelId = Field(output_processor=TakeFirst())
    control_type = Field(output_processor=TakeFirst())
示例#14
0
class JobboleItemLoader(ItemLoader):
    """Custom item loader whose fields default to a ``TakeFirst`` output."""

    # Applies to every field that does not declare its own output processor.
    default_output_processor = TakeFirst()
示例#15
0
class FacebookProfileLoader(ItemLoader):
    """Item loader for profile items with a ``TakeFirst`` default output."""

    default_output_processor = TakeFirst()

    # Every fbid value is passed through to_int (defined outside this block).
    fbid_in = MapCompose(to_int)
示例#16
0
class ReviewItem(scrapy.Item):
    """Review item.

    All fields keep only their first collected value, except ``images`` and
    ``videos``, which instead use ``to_str`` (defined outside this block) as
    their input processor.
    """

    # Product the review belongs to
    product_id = scrapy.Field(output_processor=TakeFirst())
    product_url = scrapy.Field(output_processor=TakeFirst())

    # The review itself
    review_text = scrapy.Field(output_processor=TakeFirst())
    review_id = scrapy.Field(output_processor=TakeFirst())
    guid = scrapy.Field(output_processor=TakeFirst())
    images_count = scrapy.Field(output_processor=TakeFirst())
    creation_time = scrapy.Field(output_processor=TakeFirst())
    images = scrapy.Field(input_processor=to_str)
    reviewer = scrapy.Field(output_processor=TakeFirst())

    # Product variant details
    product_color = scrapy.Field(output_processor=TakeFirst())
    product_sales = scrapy.Field(output_processor=TakeFirst())
    product_size = scrapy.Field(output_processor=TakeFirst())

    videos = scrapy.Field(input_processor=to_str)

    # Counters and score
    reply_count = scrapy.Field(output_processor=TakeFirst())
    reply_count_2 = scrapy.Field(output_processor=TakeFirst())
    review_score = scrapy.Field(output_processor=TakeFirst())
    useful_vote_count = scrapy.Field(output_processor=TakeFirst())
示例#17
0
class ExampleLoader(ItemLoader):
    """Item loader that builds ``ExampleItem`` objects by default."""

    default_item_class = ExampleItem

    # Every incoming value has its .strip() method called.
    default_input_processor = MapCompose(lambda s: s.strip())

    # Fields keep their first value, except description, which is joined.
    default_output_processor = TakeFirst()
    description_out = Join()
示例#18
0
class NewsItem(scrapy.Item):
    """Item carrying a single news title."""

    # remove_tags is applied to every extracted value; only the first is kept.
    news_title = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )
示例#19
0
class ArticleItemLoader(ItemLoader):
    """Custom item loader that applies ``TakeFirst`` as the default input processor."""

    # NOTE(review): TakeFirst is set as the default *input* processor here,
    # not as the output processor. This may be a typo for
    # default_output_processor - confirm the intent before changing it.
    default_input_processor = TakeFirst()
示例#20
0
class AppstoreCrawlerItemLoader_cn(ItemLoader):
    """Custom item loader whose fields default to a ``TakeFirst`` output."""

    # Applies to every field that does not declare its own output processor.
    default_output_processor = TakeFirst()
示例#21
0
class NewsLoader(ItemLoader):
    """Item loader whose fields default to a ``TakeFirst`` output."""

    # Equivalent to calling extract_first() on each field's values.
    default_output_processor = TakeFirst()
示例#22
0
class SpiderItemLoader(ItemLoader):
    """Item loader configured at runtime from a ``db_spider`` object.

    The ``db_spider`` object is read from the loader context. For every name
    in ``FIELDS`` it is expected to expose:

    * ``<field>``      - an XPath expression or a literal value,
    * ``<field>_type`` - 0 (XPath on the listing page), 1 (XPath on the detail
      page) or 2 (literal value),
    * ``<field>_in`` / ``<field>_out`` - optional Python expressions that
      evaluate to the field's input / output processor.
    """

    FIELDS = [
        'shop_code', 'shop_title', 'shop_url', 'shop_price',
        'shop_availability'
    ]

    # Default processors
    default_output_processor = TakeFirst()

    def __init__(self,
                 item=None,
                 selector=None,
                 response=None,
                 parent=None,
                 **context):
        """Initialise the loader and install per-field processors.

        Args:
            item, selector, response, parent: forwarded to ``ItemLoader``.
            **context: loader context; the ``db_spider`` entry supplies the
                per-field configuration.
        """
        super(SpiderItemLoader, self).__init__(item, selector, response,
                                               parent, **context)
        self.db_spider = self.context.get('db_spider')

        # Setup processors
        for field in self.FIELDS:

            # SECURITY: the processor expressions are evaluated with eval().
            # Anyone who can write to the db_spider record can run arbitrary
            # code in the crawler process - only use with trusted records.

            # Input
            input_field = self._db_text(field + '_in')
            if input_field:
                setattr(self, field + '_in', eval(input_field))

            # Output (stored under "<field>_out" so that it does not
            # overwrite the input processor assigned above)
            output_field = self._db_text(field + '_out')
            if output_field:
                setattr(self, field + '_out', eval(output_field))

    def _db_text(self, name):
        """Return ``db_spider.<name>`` as a string, or '' when it is None."""
        raw = getattr(self.db_spider, name)
        # str(None) would be the truthy string "None"; treat None as unset.
        if raw is None:
            return ''
        return str(raw)

    def _add_fields_of_type(self, wanted_type, adder):
        """Call ``adder(field, text)`` for each configured field of a type."""
        for field in self.FIELDS:
            if getattr(self.db_spider, field + '_type') != wanted_type:
                continue
            text = self._db_text(field)
            if text:
                adder(field, text)

    def add_xpaths(self):
        """
        Method for populating item fields from xpaths
        """
        self._add_fields_of_type(0, self.add_xpath)

    def add_detail_xpaths(self):
        """
        Method for populating item fields from xpaths after following the item page
        """
        self._add_fields_of_type(1, self.add_xpath)

    def add_values(self):
        """
        Method for populating item fields from values
        """
        self._add_fields_of_type(2, self.add_value)
示例#23
0
文件: items.py 项目: wwwxmu/dianping
class ReviewsItemLoader(ItemLoader):
    """Item loader whose fields default to a ``TakeFirst`` output."""

    default_output_processor = TakeFirst()
示例#24
0
class qimaiItemLoader(ItemLoader):
    """Custom item loader whose fields default to a ``TakeFirst`` output."""

    # Applies to every field that does not declare its own output processor.
    default_output_processor = TakeFirst()
示例#25
0
class AttorneyLoader(ItemLoader):
    """Item loader for attorney records.

    ``phone`` replaces the default input processor with ``serialize_number``;
    all other fields use the defaults below.
    """

    # Defaults: remove_tags followed by str.strip on input, TakeFirst on output.
    default_input_processor = MapCompose(remove_tags, str.strip)
    default_output_processor = TakeFirst()

    # serialize_number is defined outside this block.
    phone_in = MapCompose(serialize_number)
示例#26
0
class CoordinateItem(Item):
    """Item with ``lat`` and ``lon`` fields, each converted with ``float``."""

    # Every collected value is passed to float(); only the first is kept.
    lat = Field(
        input_processor=MapCompose(float),
        output_processor=TakeFirst(),
    )
    lon = Field(
        input_processor=MapCompose(float),
        output_processor=TakeFirst(),
    )
示例#27
0
class RealtorListingExtraJulienLoader(ItemLoader):
    """Item loader with default processors and a pass-through ``listing`` input."""

    # Defaults: remove_tags followed by str.strip on input, TakeFirst on output.
    default_input_processor = MapCompose(remove_tags, str.strip)
    default_output_processor = TakeFirst()

    # listing values skip the default input processing and are left as-is.
    listing_in = Identity()
示例#28
0
class ParamItem(Item):
    """Key/value pair item; both fields share the same processing."""

    # Each value goes through strip (defined outside this block); only the
    # first result is kept.
    key = Field(
        input_processor=MapCompose(strip),
        output_processor=TakeFirst(),
    )
    value = Field(
        input_processor=MapCompose(strip),
        output_processor=TakeFirst(),
    )
示例#29
0
class ArticleItemLoader(ItemLoader):
    """Item loader whose fields default to a ``TakeFirst`` output."""

    default_output_processor = TakeFirst()
示例#30
0
class RatingLoader(ItemLoader):
    """Loader for RatingItem."""

    # Each incoming value runs through this chain in order. All helpers
    # except str are defined outside this block.
    default_input_processor = MapCompose(
        identity,
        str,
        remove_tags,
        replace_all_entities,
        normalize_space,
    )

    # Fields without their own output processor keep only the first value.
    default_output_processor = TakeFirst()