Пример #1
0
class YoutubeItem(scrapy.Item):
    """Scrapy item with id, name, videos, partners, subscribers, is_verified.

    Each field carries its processors in the field metadata. The scalar
    fields run every value through ``remove_tags`` and keep the first
    result; ``videos`` and ``partners`` keep whatever was collected.
    """

    # Scalar fields: tags removed on input, first value kept on output.
    id = scrapy.Field(
        input_processor=MapCompose(remove_tags), output_processor=TakeFirst()
    )
    name = scrapy.Field(
        input_processor=MapCompose(remove_tags), output_processor=TakeFirst()
    )

    # Multi-valued fields: the collected values are returned unchanged.
    videos = scrapy.Field(output_processor=Identity())
    partners = scrapy.Field(output_processor=Identity())

    # More scalar fields, processed the same way as id and name.
    subscribers = scrapy.Field(
        input_processor=MapCompose(remove_tags), output_processor=TakeFirst()
    )
    is_verified = scrapy.Field(
        input_processor=MapCompose(remove_tags), output_processor=TakeFirst()
    )
Пример #2
0
class BcDailyPostLoader(ItemLoader):
    """Item loader that populates ``BcDailyPost`` items.

    By default each incoming value goes through ``strip_html5_whitespace``
    and the first collected value is emitted. ``title``, ``tags`` and
    ``to_dl`` override the output step.
    """

    default_item_class = BcDailyPost

    # Defaults used by every field that has no explicit override below.
    default_input_processor = MapCompose(strip_html5_whitespace)
    default_output_processor = Compose(TakeFirst())

    # tags and to_dl keep all collected values as they are.
    tags_out = Identity()
    to_dl_out = Identity()

    # title: first value, then passed through normalize().
    title_out = Compose(TakeFirst(), normalize)
Пример #3
0
class CompanyLoader(ItemLoader):
    """Item loader whose defaults trim values and emit the first one.

    ``phone`` has its own input processor, and ``categories`` and
    ``postal_code`` have their own output processors.
    """

    # Default input: strip surrounding whitespace, then drop newlines.
    default_input_processor = MapCompose(
        lambda text: text.strip().replace('\n', '')
    )
    # Default output: first collected value.
    default_output_processor = TakeFirst()

    # phone: remove every space and \xa0 character from each value.
    phone_in = MapCompose(lambda number: re.sub(u' |\xa0', '', number))

    # categories: keep all collected values.
    categories_out = Identity()

    # postal_code: first value converted with int().
    postal_code_out = Compose(TakeFirst(), int)
Пример #4
0
class YoutubeVideoItem(scrapy.Item):
    """Scrapy item with url, category, date, title, views, likes, dislikes
    and comments fields.

    Every field except ``comments`` runs its values through ``remove_tags``
    and keeps the first result; ``comments`` keeps all collected values.
    """

    # Scalar fields: tags removed on input, first value kept on output.
    url = scrapy.Field(
        input_processor=MapCompose(remove_tags), output_processor=TakeFirst()
    )
    category = scrapy.Field(
        input_processor=MapCompose(remove_tags), output_processor=TakeFirst()
    )
    date = scrapy.Field(
        input_processor=MapCompose(remove_tags), output_processor=TakeFirst()
    )
    title = scrapy.Field(
        input_processor=MapCompose(remove_tags), output_processor=TakeFirst()
    )
    views = scrapy.Field(
        input_processor=MapCompose(remove_tags), output_processor=TakeFirst()
    )
    likes = scrapy.Field(
        input_processor=MapCompose(remove_tags), output_processor=TakeFirst()
    )
    dislikes = scrapy.Field(
        input_processor=MapCompose(remove_tags), output_processor=TakeFirst()
    )

    # Multi-valued field: collected values are returned unchanged.
    comments = scrapy.Field(output_processor=Identity())
Пример #5
0
class ScraperProductLoader(ItemLoader):
    """
    Creates items via XPath or CSS expressions.

    Basically, reduces the amount of work involved in scraping items because
    the item loader can take an XPath or CSS expression and immediately load
    that into the item (or add multiple values, if they exist).

    As well, the item loader can handle custom input / output processing for
    common operations.

    More details available in the docs:
        https://docs.scrapy.org/en/latest/topics/loaders.html
    """
    # Fields without an explicit *_out processor emit their first value.
    default_output_processor = TakeFirst()

    # Whitespace is trimmed through the value's own strip() method instead of
    # ``unicode.strip``: the ``unicode`` builtin does not exist on Python 3,
    # so referencing it made this class body fail with NameError on import.
    # The lambda behaves the same for text values on Python 2 and 3.
    name_in = Compose(TakeFirst(), lambda text: text.strip())

    # description: join the extracted pieces, trim, then sanitize the HTML.
    description_in = Compose(Join(), lambda text: text.strip(), sanitize_html)

    # details: join the extracted pieces and sanitize the HTML (no trimming).
    details_in = Compose(Join(), sanitize_html)

    # NOTE(review): DefaultValue and MergeDicts are defined elsewhere;
    # presumably this falls back to an empty dict and merges the collected
    # dicts into one - confirm against their definitions.
    attributes_out = Compose(DefaultValue(lambda: {}), MergeDicts())

    # image_urls: keep every collected value.
    image_urls_out = Identity()
Пример #6
0
    def parse_item(self, response):
        """Print the response URL, then load and return a ``MeizituItem``.

        ``name`` is read from the text of ``//h2/a`` and ``img_urls`` from
        the ``src`` attributes of ``//div/p/img``.
        """
        print('  >>>> %s' % response.url)

        name_xpath = '//h2/a/text()'
        image_xpath = '//div/p/img/@src'

        item_loader = ItemLoader(item=MeizituItem(), response=response)
        item_loader.add_xpath('name', name_xpath)
        # Identity() hands the extracted values on unchanged.
        item_loader.add_xpath('img_urls', image_xpath, Identity())
        return item_loader.load_item()
Пример #7
0
    def parse_item(self, response):
        """Load and return a ``YouwuItem`` built from *response*.

        Collects ``image_urls`` from the ``src`` of the ``bigimg`` image,
        ``url`` from ``response.url`` and ``text`` from the ``h1`` inside
        the ``photos`` div.
        """
        print("parse_item ")

        image_xpath = "//img[@id='bigimg']/@src"
        heading_xpath = "//div[@id='photos']/h1/text()"

        loader = ItemLoader(item=YouwuItem(), response=response)
        # Identity() hands the extracted values on unchanged.
        loader.add_xpath('image_urls', image_xpath, Identity())
        loader.add_value('url', response.url)
        loader.add_xpath('text', heading_xpath)
        return loader.load_item()
Пример #8
0
    def parse_item(self, response):
        """Load and return a ``Mm131Item`` built from *response*.

        Collects ``image_urls`` from the images linked inside the
        ``content-pic`` div, ``url`` from ``response.url`` and ``text`` from
        the ``h5`` inside the ``content`` div.
        """
        print("parse_item ")

        image_xpath = "//div[@class='content-pic']/a/img/@src"
        heading_xpath = "//div[@class='content']/h5/text()"

        loader = ItemLoader(item=Mm131Item(), response=response)
        # Identity() hands the extracted values on unchanged.
        loader.add_xpath('image_urls', image_xpath, Identity())
        loader.add_value('url', response.url)
        loader.add_xpath('text', heading_xpath)
        return loader.load_item()
Пример #9
0
    def parse_item(self, response):
        """Load and return a ``MeizituItem`` built from *response*.

        Collects ``name`` from the text of ``//h2/a``, ``image_url`` from the
        images inside the ``picture`` div and ``url`` from ``response.url``.
        """
        name_xpath = '//h2/a/text()'
        image_xpath = "//div[@id='picture']/p/img/@src"

        loader = ItemLoader(item=MeizituItem(), response=response)
        loader.add_xpath('name', name_xpath)
        # NOTE: a 'tag' field was once read from the post-meta paragraph of
        # the page; that extraction is disabled and no tag is loaded here.
        # Identity() hands the extracted values on unchanged.
        loader.add_xpath('image_url', image_xpath, Identity())

        loader.add_value('url', response.url)
        return loader.load_item()
Пример #10
0
class YoutubeFeedItem(scrapy.Item):
    """Scrapy item with an ``id`` field and a ``channels`` field.

    ``id`` runs its values through ``remove_tags`` and keeps the first
    result; ``channels`` keeps all collected values.
    """

    # Scalar field: tags removed on input, first value kept on output.
    id = scrapy.Field(
        input_processor=MapCompose(remove_tags), output_processor=TakeFirst()
    )

    # Multi-valued field: collected values are returned unchanged.
    channels = scrapy.Field(output_processor=Identity())
Пример #11
0
    def parse_item(self, response):
        """Load and return a ``DaiqiyangItem`` built from *response*.

        Collects ``image_urls`` from the images linked inside the ``showimg``
        div, ``url`` from ``response.url`` and ``text`` from the ``h1``
        inside the ``crumbs`` div.
        """
        print("parse_item ")

        image_xpath = "//div[@class='showimg']/a/img/@src"
        heading_xpath = "//div[@class='crumbs']/h1/text()"

        loader = ItemLoader(item=DaiqiyangItem(), response=response)
        # Identity() hands the extracted values on unchanged.
        loader.add_xpath('image_urls', image_xpath, Identity())
        loader.add_value('url', response.url)
        loader.add_xpath('text', heading_xpath)
        return loader.load_item()
Пример #12
0
    def parse_item(self, response):
        """Load and return a ``Www7160Item`` built from *response*.

        Collects ``image_urls`` from the images linked inside the
        ``picsbox picsboxcenter`` div, ``url`` from ``response.url`` and
        ``text`` from the ``h1`` inside the ``photos`` div.
        """
        print("parse_item ")

        image_xpath = "//div[@class='picsbox picsboxcenter']/p/a/img/@src"
        heading_xpath = "//div[@id='photos']/h1/text()"

        loader = ItemLoader(item=Www7160Item(), response=response)
        # Identity() hands the extracted values on unchanged.
        loader.add_xpath('image_urls', image_xpath, Identity())
        loader.add_value('url', response.url)
        loader.add_xpath('text', heading_xpath)
        return loader.load_item()
Пример #13
0
 def parse_item(self, response):
     """Load and return an ``ImageItem`` built from *response*.

     Collects ``image_urls`` from the ``src`` of image-type ``<input>``
     elements, ``url`` from ``response.url`` and ``filePath`` from
     ``response.meta['item']``.
     """
     image_xpath = "//input[@type='image']/@src"

     loader = ItemLoader(item=ImageItem(), response=response)
     # Identity() hands the extracted values on unchanged.
     loader.add_xpath('image_urls', image_xpath, Identity())
     # The request that produced this response stored the value under the
     # 'item' key of its meta dict.
     target_path = response.meta['item']
     loader.add_value('url', response.url)
     loader.add_value('filePath', target_path)
     return loader.load_item()
Пример #14
0
 def parse_item(self, response):
     """Load and return a ``YoumeituItem`` built from *response*.

     Collects ``name`` from the text of ``//h2/a``, ``tags`` from the
     paragraph in the post-meta block, ``image_urls`` from the image inside
     the ``item-tip`` link and ``url`` from ``response.url``.
     """
     name_xpath = '//h2/a/text()'
     tags_xpath = "//div[@id='maincontent']/div[@class='postmeta  clearfix']/div[@class='metaRight']/p"
     image_xpath = "//div[@id='mainbox']/div[@id='canvasbox']/div[@id='content']/a[@id='item-tip']/img/@src"

     loader = ItemLoader(item=YoumeituItem(), response=response)
     loader.add_xpath('name', name_xpath)
     loader.add_xpath('tags', tags_xpath)
     # Identity() hands the extracted values on unchanged.
     loader.add_xpath('image_urls', image_xpath, Identity())
     loader.add_value('url', response.url)
     return loader.load_item()
Пример #15
0
class URLListLoader(ItemLoader):
    """Item loader that fills the ``urls`` field as soon as it is created.

    Two CSS queries are run against the loader's selector during
    construction, and every matching ``href`` is added to ``urls``. All
    fields emit their collected values unchanged.
    """

    default_output_processor = Identity()

    def __init__(self, item=None, selector=None, response=None, parent=None,
                 **context):
        """Initialise the base loader, then collect the ``urls`` values.

        All arguments are forwarded unchanged to ``ItemLoader.__init__``.
        """
        super().__init__(item, selector, response, parent, **context)

        href_queries = (
            # starcity item
            "div.listItem-details > h4.listItem-title > a::attr(href)",
            # gatherer item
            "tr.cardItem > td > a::attr(href)",
        )
        for query in href_queries:
            self.add_css("urls", query)
Пример #16
0
class ArticleLoader(ItemLoader):
    """Item loader that cleans values and emits the first one by default.

    Input values go through ``remove_tags`` and ``str.strip``; the ``url``
    field overrides the input step and receives its values unchanged.
    """

    # url: bypass the default input cleaning.
    url_in = Identity()

    # Defaults for every other field.
    default_input_processor = MapCompose(remove_tags, str.strip)
    default_output_processor = TakeFirst()