示例#1
0
class YoutubeItem(scrapy.Item):
    """Scraped YouTube record: id, name, videos, partners, subscribers, is_verified.

    Each field carries ``input_processor`` / ``output_processor`` metadata.
    Scalar fields run every collected value through ``remove_tags`` and keep
    the first result; ``videos`` and ``partners`` keep all collected values.
    (Assumes ``MapCompose``, ``TakeFirst`` and ``Identity`` are Scrapy's
    standard loader processors; their imports are outside this block.)
    """

    id = scrapy.Field(input_processor=MapCompose(remove_tags),
                      output_processor=TakeFirst())
    name = scrapy.Field(input_processor=MapCompose(remove_tags),
                        output_processor=TakeFirst())

    # Multi-valued fields: output is left as the full list of values.
    videos = scrapy.Field(output_processor=Identity())
    partners = scrapy.Field(output_processor=Identity())

    subscribers = scrapy.Field(input_processor=MapCompose(remove_tags),
                               output_processor=TakeFirst())
    is_verified = scrapy.Field(input_processor=MapCompose(remove_tags),
                               output_processor=TakeFirst())
示例#2
0
class BcDailyPostLoader(ItemLoader):
    """Item loader producing ``BcDailyPost`` items.

    Unless a field has its own processor, input values are mapped through
    ``strip_html5_whitespace`` and the output is the first collected value.
    """

    default_item_class = BcDailyPost

    # Defaults used by every field without a dedicated ``*_in`` / ``*_out``.
    default_input_processor = MapCompose(strip_html5_whitespace)
    default_output_processor = Compose(TakeFirst())

    # List-like fields: keep every collected value.
    tags_out = Identity()
    to_dl_out = Identity()

    # Title: take the first value, then pass it through ``normalize``.
    title_out = Compose(TakeFirst(), normalize)
示例#3
0
class CompanyLoader(ItemLoader):
    """Item loader with whitespace clean-up for company fields.

    By default each input value is stripped and has its newlines removed,
    and the output is the first collected value.
    """

    default_output_processor = TakeFirst()
    default_input_processor = MapCompose(
        lambda text: text.strip().replace('\n', '')
    )

    # Phone: drop plain and non-breaking spaces (this replaces the default
    # input processor for the field).
    phone_in = MapCompose(lambda raw: re.sub(u' |\xa0', '', raw))

    # Postal code: first value, converted with ``int``.
    postal_code_out = Compose(TakeFirst(), int)

    # Categories: keep every collected value.
    categories_out = Identity()
示例#4
0
class YoutubeVideoItem(scrapy.Item):
    """Scraped YouTube video record.

    All fields except ``comments`` run every collected value through
    ``remove_tags`` and keep the first result; ``comments`` keeps all
    collected values. (Assumes ``MapCompose``, ``TakeFirst`` and
    ``Identity`` are Scrapy's standard loader processors; their imports are
    outside this block.)
    """

    url = scrapy.Field(input_processor=MapCompose(remove_tags),
                       output_processor=TakeFirst())
    category = scrapy.Field(input_processor=MapCompose(remove_tags),
                            output_processor=TakeFirst())
    date = scrapy.Field(input_processor=MapCompose(remove_tags),
                        output_processor=TakeFirst())
    title = scrapy.Field(input_processor=MapCompose(remove_tags),
                         output_processor=TakeFirst())
    views = scrapy.Field(input_processor=MapCompose(remove_tags),
                         output_processor=TakeFirst())
    likes = scrapy.Field(input_processor=MapCompose(remove_tags),
                         output_processor=TakeFirst())
    dislikes = scrapy.Field(input_processor=MapCompose(remove_tags),
                            output_processor=TakeFirst())

    # Multi-valued field: output is left as the full list of values.
    comments = scrapy.Field(output_processor=Identity())
示例#5
0
class ScraperProductLoader(ItemLoader):
    """
    Creates items via XPath or CSS expressions.

    Basically, reduces the amount of work involved in scraping items because
    the item loader can take an XPath or CSS expression and immediately load
    that into the item (or add multiple values, if they exist).

    As well, the item loader can handle custom input / output processing for
    common operations.

    More details available in the docs:
        https://docs.scrapy.org/en/latest/topics/loaders.html
    """
    # Fields without a dedicated output processor yield their first value.
    default_output_processor = TakeFirst()

    # NOTE: the text is stripped by calling ``.strip()`` on the value rather
    # than via ``unicode.strip``; the ``unicode`` builtin does not exist on
    # Python 3 and would raise NameError while this class body is executed.
    name_in = Compose(TakeFirst(), lambda value: value.strip())

    # Join all fragments, trim the result, then pass it to ``sanitize_html``.
    description_in = Compose(Join(), lambda value: value.strip(), sanitize_html)

    # Join all fragments and pass the result to ``sanitize_html`` (no trim).
    details_in = Compose(Join(), sanitize_html)

    # ``DefaultValue`` and ``MergeDicts`` are defined outside this block;
    # presumably they supply an empty dict when nothing was collected and
    # merge the collected dicts into one -- confirm against their definitions.
    attributes_out = Compose(DefaultValue(lambda: {}), MergeDicts())

    # Keep every collected image URL.
    image_urls_out = Identity()
示例#6
0
    def parse_item(self, response):
        """Load a ``MeizituItem`` from *response* and return it.

        Prints the response URL, then fills ``name`` and ``img_urls`` from
        XPath queries against the response.
        """
        print('  >>>> %s' % response.url)

        name_xpath = '//h2/a/text()'
        images_xpath = '//div/p/img/@src'

        item_loader = ItemLoader(item=MeizituItem(), response=response)
        item_loader.add_xpath('name', name_xpath)
        # Identity() is supplied as an extra processor for the image URLs.
        item_loader.add_xpath('img_urls', images_xpath, Identity())
        return item_loader.load_item()
示例#7
0
    def parse_item(self, response):
        """Load a ``YouwuItem`` from *response* and return it.

        Fills ``image_urls`` and ``text`` from XPath queries and ``url`` from
        the response URL.
        """
        print("parse_item ")

        image_xpath = "//img[@id='bigimg']/@src"
        title_xpath = "//div[@id='photos']/h1/text()"

        loader = ItemLoader(item=YouwuItem(), response=response)
        # Identity() is supplied as an extra processor for the image URLs.
        loader.add_xpath('image_urls', image_xpath, Identity())
        loader.add_value('url', response.url)
        loader.add_xpath('text', title_xpath)
        return loader.load_item()
示例#8
0
    def parse_item(self, response):
        """Load a ``Mm131Item`` from *response* and return it.

        Fills ``image_urls`` and ``text`` from XPath queries and ``url`` from
        the response URL.
        """
        print("parse_item ")

        image_xpath = "//div[@class='content-pic']/a/img/@src"
        title_xpath = "//div[@class='content']/h5/text()"

        loader = ItemLoader(item=Mm131Item(), response=response)
        # Identity() is supplied as an extra processor for the image URLs.
        loader.add_xpath('image_urls', image_xpath, Identity())
        loader.add_value('url', response.url)
        loader.add_xpath('text', title_xpath)
        return loader.load_item()
示例#9
0
    def parse_item(self, response):
        """Load a ``MeizituItem`` from *response* and return it.

        Fills ``name`` and ``image_url`` from XPath queries and ``url`` from
        the response URL.
        """
        name_xpath = '//h2/a/text()'
        image_xpath = "//div[@id='picture']/p/img/@src"

        loader = ItemLoader(item=MeizituItem(), response=response)
        loader.add_xpath('name', name_xpath)
        # NOTE: extraction of the 'tag' field is disabled; the original call was:
        # loader.add_xpath('tag', "//div[@id='maincontent']/div[@class='postmeta  clearfix']/div[@class='metaRight']/p")
        # Identity() is supplied as an extra processor for the image URL.
        loader.add_xpath('image_url', image_xpath, Identity())
        loader.add_value('url', response.url)
        return loader.load_item()
示例#10
0
class YoutubeFeedItem(scrapy.Item):
    """Scraped YouTube feed record with an ``id`` and a list of ``channels``.

    ``id`` runs every collected value through ``remove_tags`` and keeps the
    first result; ``channels`` keeps all collected values. (Assumes
    ``MapCompose``, ``TakeFirst`` and ``Identity`` are Scrapy's standard
    loader processors; their imports are outside this block.)
    """

    id = scrapy.Field(input_processor=MapCompose(remove_tags),
                      output_processor=TakeFirst())

    # Multi-valued field: output is left as the full list of values.
    channels = scrapy.Field(output_processor=Identity())
示例#11
0
    def parse_item(self, response):
        """Load a ``DaiqiyangItem`` from *response* and return it.

        Fills ``image_urls`` and ``text`` from XPath queries and ``url`` from
        the response URL.
        """
        print("parse_item ")

        image_xpath = "//div[@class='showimg']/a/img/@src"
        title_xpath = "//div[@class='crumbs']/h1/text()"

        loader = ItemLoader(item=DaiqiyangItem(), response=response)
        # Identity() is supplied as an extra processor for the image URLs.
        loader.add_xpath('image_urls', image_xpath, Identity())
        loader.add_value('url', response.url)
        loader.add_xpath('text', title_xpath)
        return loader.load_item()
示例#12
0
    def parse_item(self, response):
        """Load a ``Www7160Item`` from *response* and return it.

        Fills ``image_urls`` and ``text`` from XPath queries and ``url`` from
        the response URL.
        """
        print("parse_item ")

        image_xpath = "//div[@class='picsbox picsboxcenter']/p/a/img/@src"
        title_xpath = "//div[@id='photos']/h1/text()"

        loader = ItemLoader(item=Www7160Item(), response=response)
        # Identity() is supplied as an extra processor for the image URLs.
        loader.add_xpath('image_urls', image_xpath, Identity())
        loader.add_value('url', response.url)
        loader.add_xpath('text', title_xpath)
        return loader.load_item()
示例#13
0
 def parse_item(self, response):
     """Load an ``ImageItem`` from *response* and return it.

     Fills ``image_urls`` from an XPath query, ``url`` from the response URL
     and ``filePath`` from ``response.meta['item']``.
     """
     image_xpath = "//input[@type='image']/@src"

     loader = ItemLoader(item=ImageItem(), response=response)
     # Identity() is supplied as an extra processor for the image URLs.
     loader.add_xpath('image_urls', image_xpath, Identity())
     loader.add_value('url', response.url)
     # The value stored under meta key 'item' is used as the file path.
     loader.add_value('filePath', response.meta['item'])
     return loader.load_item()
示例#14
0
 def parse_item(self, response):
     """Load a ``YoumeituItem`` from *response* and return it.

     Fills ``name``, ``tags`` and ``image_urls`` from XPath queries and
     ``url`` from the response URL.
     """
     name_xpath = '//h2/a/text()'
     tags_xpath = "//div[@id='maincontent']/div[@class='postmeta  clearfix']/div[@class='metaRight']/p"
     image_xpath = "//div[@id='mainbox']/div[@id='canvasbox']/div[@id='content']/a[@id='item-tip']/img/@src"

     loader = ItemLoader(item=YoumeituItem(), response=response)
     loader.add_xpath('name', name_xpath)
     loader.add_xpath('tags', tags_xpath)
     # Identity() is supplied as an extra processor for the image URLs.
     loader.add_xpath('image_urls', image_xpath, Identity())
     loader.add_value('url', response.url)
     return loader.load_item()
示例#15
0
class URLListLoader(ItemLoader):
    """Item loader that fills the ``urls`` field as soon as it is created.

    On construction it runs two CSS queries against the loader's selector
    and adds every matching ``href`` to ``urls``. All collected values are
    kept in the output.
    """

    default_output_processor = Identity()

    def __init__(self, item=None, selector=None, response=None, parent=None,
                 **context):
        """Initialise the base loader, then collect the link hrefs."""
        super().__init__(item, selector, response, parent, **context)

        href_queries = (
            # starcity item
            "div.listItem-details > h4.listItem-title > a::attr(href)",
            # gatherer item
            "tr.cardItem > td > a::attr(href)",
        )
        for css_query in href_queries:
            self.add_css("urls", css_query)
示例#16
0
class ArticleLoader(ItemLoader):
    """Item loader for article fields.

    By default each input value is passed through ``remove_tags`` and then
    ``str.strip``, and the output is the first collected value. The ``url``
    field opts out of the input clean-up.
    """

    # URL values are taken as-is (no tag removal or stripping).
    url_in = Identity()

    # Defaults for every field without a dedicated processor.
    default_output_processor = TakeFirst()
    default_input_processor = MapCompose(remove_tags, str.strip)