def parse_item(self, response):
    """Build a CrawlpictureItem from a picture page.

    Scrapes the title, the tag text, every image source in the post
    body, and records the page URL itself.
    """
    loader = ItemLoader(item=CrawlpictureItem(), response=response)
    loader.add_xpath('name', '//h2/a/text()')
    loader.add_css('tags', 'div.metaRight p::text')
    # Identity() keeps the full list of scraped src values instead of
    # letting a default processor collapse them.
    loader.add_css('image_urls', 'div.postContent img::attr(src)', Identity())
    loader.add_value('url', response.url)
    return loader.load_item()
def parse_product(self, response):
    """Scrape a product detail page into a Product item.

    Collects the name, page URL, long description (the HTML and text
    fields deliberately share the same selector), carousel image URLs
    and breadcrumb categories.
    """
    p = ItemLoader(item=Product(), response=response)
    p.add_css('nome', 'h1 > span[itemprop=name]::text')
    p.add_value('url', response.url)
    p.add_css('descricaoLongaHtml', '.infoProdBox')
    p.add_css('descricaoLonga', '.infoProdBox')
    # Bug fix: the original character class ["|'] also matched a literal
    # '|' (pipe is literal inside [...]), so src=|...| would slip through.
    # ["'] matches only the quote delimiters around the src value.
    p.add_css('image', 'ul.a-carousel-list > li > img',
              re=r'src=["\'](?P<src>[^"\']+)["\']')
    p.add_css('categorias', 'div[class=breadcrumb-box] span[itemprop=name]::text')
    yield p.load_item()
def parse(self, res):
    """Scrape the page's link, title and description into a WeiboItem."""
    l = ItemLoader(item=WeiboItem(), response=res)
    # NOTE(review): these selectors ('link[href]', 'title', 'a span')
    # select whole elements, not ::text / ::attr values -- confirm the
    # item's processors expect raw tag markup.
    l.add_css("link", "link[href]")
    l.add_css("title", "title")
    l.add_css("desc", "a span")
    # Bug fix: the original used the Python 2 print STATEMENT, which is a
    # SyntaxError under Python 3; other methods in this file already use
    # the print() function form. Message text unchanged.
    print("pachong is over!!!")
    return l.load_item()
def parsePost(self, response):
    """Extract a single blog post into a Post item.

    Title, date and author come from CSS selectors; the article body is
    extracted with pyquery since it spans mixed markup.
    """
    doc = pyq(response.body)
    post_loader = ItemLoader(item=Post(), response=response)
    post_loader.add_value('url', response.url)
    post_loader.add_css('title', 'h1.entry-title::text')
    post_loader.add_css('date', 'span.entry-date::text')
    post_loader.add_css('author', 'span.author.vcard > a::text')
    post_loader.add_value('content', doc('div.entry-content').text())
    return post_loader.load_item()
def Loader_index(self, item_selector):
    """Load one index-row entry (title, url, date, preview image).

    The cover image list is used twice: as the 'preview' value and as
    the 'image_urls' download field.
    """
    loader = ItemLoader(item={}, selector=item_selector)
    cover_imgs = loader.get_xpath('.//*[@class="lz_img"]/img/@src')
    loader.add_xpath('title', './/*[@class="k_list-lb-2"]/div[1]/a[1]/text()')
    loader.add_xpath('url', './/*[@class="k_list-lb-2"]/div[1]/a/@href')
    loader.add_value('preview', cover_imgs)
    # Keep only the YYYY-MM-DD portion of the date cell.
    loader.add_css('date', '#k_list-lb-2-f::text', re=r'(\d{4}-\d{2}-\d{2})')
    loader.add_value('image_urls', cover_imgs)
    return loader.load_item()
def parse_product(self, response):
    """Scrape a product detail page into a Product item.

    Collects name, page URL, long description (HTML and text fields
    share one selector), carousel image URLs and breadcrumb categories.
    """
    p = ItemLoader(item=Product(), response=response)
    p.add_css('nome', 'h1 > span[itemprop=name]::text')
    p.add_value('url', response.url)
    p.add_css('descricaoLongaHtml', '.infoProdBox')
    p.add_css('descricaoLonga', '.infoProdBox')
    # Bug fix: ["|'] in the original regex also matched a literal '|'
    # (pipe has no alternation meaning inside a character class); ["']
    # matches only the actual quote delimiters.
    p.add_css('image', 'ul.a-carousel-list > li > img',
              re=r'src=["\'](?P<src>[^"\']+)["\']')
    p.add_css('categorias', 'div[class=breadcrumb-box] span[itemprop=name]::text')
    yield p.load_item()
def parse(self, response):
    """Emit one item per matched album, stopping at self.count_limit.

    Raises CloseSpider('done') as soon as the limit is exceeded so the
    crawl ends early.
    """
    for seen, album_sel in enumerate(response.css(self.filter_css), start=1):
        if seen > self.count_limit:
            raise CloseSpider('done')
        loader = ItemLoader(KuwoScrapyItem(), album_sel)
        loader.add_value('basic_source_info', '{}')
        loader.add_css('basic_source_name', self.name_css, TakeFirst())
        loader.add_css('basic_source_artist', self.artist_css, Join('&'))
        yield loader.load_item()
def parse_product(self, response):
    """Scrape a product detail page into a Product item.

    Collects name, page URL, long description, main image URL and
    breadcrumb categories.
    """
    p = ItemLoader(item=Product(), response=response)
    p.add_css('nome', 'h1.livedata::text')
    p.add_value('url', response.url)
    p.add_css('descricaoLonga', '.desc-info')
    # Bug fix: ["|'] in the original regex also matched a literal '|'
    # inside the character class; ["'] matches only the quote delimiters.
    p.add_css('image', 'div.container-product-image a.image-link > img',
              re=r'src=["\'](?P<src>[^"\']+)["\']')
    p.add_css('categorias', 'span[itemprop=title]::text')
    yield p.load_item()

# Maintenance snippets to run in the mongo shell (not Python):
# keep only products with at least one category / with allowed categories:
#   db.produto.remove({'categorias.0': {$exists: false}})
#   db.produto.remove({'categorias.0': {$nin: [' Games', ' Livros', ' DVDs e Blu-ray']}})
# delete duplicated products (group by nome, keep one id per group):
#   var duplicates = [];
#   db.produto_novo.aggregate([
#     {"$group": {"_id": "$nome", "count": {"$sum": 1}, "dups": {"$addToSet": "$_id"}}},
#     {"$match": {"_id": {"$ne": null}, "count": {"$gt": 1}}}
#   ], {allowDiskUse: true}, {cursor: {}}).result.forEach(function(doc) {
#     doc.dups.shift();
#     doc.dups.forEach(function(dupId) { duplicates.push(dupId); });
#   });
#   printjson(duplicates);
#   db.produto_novo.remove({_id: {$in: duplicates}})
def parse_content(self, response):
    '''Parse content pages.

    Loads link, title, abstract (when present), body text, and the
    date/place metadata found in dt/dd pairs into a Rede item.
    '''
    loader = ItemLoader(item=Rede(), response=response)
    # Usually, we are only interested in the first item, e.g. for title, place, etc.
    loader.default_output_processor = TakeFirst()
    # Add fields
    loader.add_value('link', response.url)
    loader.add_css('title', '.text h1', extract_text)
    # Test if text has an abstract
    abstract = response.css('.abstract')
    if abstract:
        loader.add_css('abstract', '.abstract', extract_text)
        # Body paragraphs are the siblings following the abstract.
        loader.add_css('text', '.abstract ~ p:not(.picture)', extract_text, Join('\n'))
    else:
        loader.add_css('text', '.text p:not(.picture)', extract_text, Join('\n'))
    # Metadata are in dt/dd pairs.
    keys = response.css('dl dt::text').extract()
    values = response.css('dl dd::text').extract()
    for key, value in zip(keys, values):
        if key == 'Datum:':
            match = re.search(r'(\d{1,2}\.\d{1,2}\.\d{2,4})', value)
            if match:
                # '22.03.2011' format
                value = match.group(1)
                # NOTE(review): encode(ENC) before strptime suggests this
                # was written for Python 2 byte strings -- confirm ENC and
                # the Python version before reuse.
                dt = datetime.strptime(value.encode(ENC), '%d.%m.%Y')
            else:
                # '22. März 2011' format
                # NOTE(review): '%B' parses the month name using the
                # current locale; a German locale is presumably required.
                dt = datetime.strptime(value.encode(ENC), '%d. %B %Y')
            loader.add_value('date', dt.date())
        elif key == 'Ort:':
            loader.add_value('place', value)
    return loader.load_item()
def Loader_content(response):
    """Load a detail page: title, date, download link and all images.

    Cover and in-content image URLs are stored separately ('preview',
    'content') and also merged into 'image_urls' for the image pipeline.
    """
    loader = ItemLoader(item={}, response=response)
    loader.add_css('title', '.k_jianjie-3a-1-name::text')
    # The third text node of the info block carries the date.
    loader.add_value(
        'date', loader.get_xpath('//*[@class="k_jianjie-3a-2b"]/text()')[2])
    loader.add_css('down', '.k_jianjie-3a-5down::text', TrimAll())
    cover_imgs = loader.get_xpath('//*[@id="k_jianjie-2b"]/a/img/@src')
    body_imgs = loader.get_xpath('//*[@class="content"]/p/img/@src')
    all_imgs = cover_imgs + body_imgs
    loader.add_value('src_url', response.url)
    loader.add_value('preview', cover_imgs)
    loader.add_value('content', body_imgs)
    loader.add_value('image_urls', all_imgs)
    # "Downloading images:" progress message (original wording kept).
    print('正下载图片:', all_imgs)
    return loader.load_item()
def get_app(self, response):
    """Scrape a Play Store app detail page into a PlayStoreItems item."""
    # Processor that resolves relative hrefs against the page URL.
    absolutize = Compose(
        lambda urls: [urljoin(response.url, url) for url in urls])
    il = ItemLoader(item=PlayStoreItems(), response=response)
    plain_css_fields = (
        ('app_id', '.details-wrapper::attr(data-docid)'),
        ('name', '.document-title div::text'),
        ('category', '.category span::text'),
        ('price', '.details-actions .price span::text'),
        ('offers_in_app_purchases', '.inapp-msg::text'),
        ('stars_count', '.stars-count::text'),
        ('video', '.details-trailer > span::attr(data-video-url)'),
        ('screenshots', '.screenshot::attr(src)'),
        ('update_date', '[itemprop="datePublished"]::text'),
        ('file_size', '[itemprop="fileSize"]::text'),
        ('installs', '[itemprop="numDownloads"]::text'),
        ('current_version', '[itemprop="softwareVersion"]::text'),
        ('requires_android', '[itemprop="operatingSystems"]::text'),
        ('offered_by', '[itemprop="author"] > a span::text'),
    )
    for field, selector in plain_css_fields:
        il.add_css(field, selector)
    # URL fields need absolutizing; description needs an XPath text walk.
    il.add_css('category_url', '.category::attr(href)', absolutize)
    il.add_css('offered_by_url', '[itemprop="author"] > a::attr(href)',
               absolutize)
    il.add_xpath('description',
                 '//div[contains(@class, "show-more-content")]/div//text()')
    yield il.load_item()
def work(self, rp):
    """Extract one job posting into a NayangJobItem."""
    # Debug dump of the request headers that produced this response.
    print(rp.request.headers, '+++++++++')
    loader = ItemLoader(item=NayangJobItem(), response=rp)
    loader.add_css('name', '.f_left>h2::text')
    loader.add_css('company', '.gs_name2 a::text')
    loader.add_css('education', '.clearfix em:nth-child(7)::text')
    # NOTE(review): ':nth-child(-2)' is not a valid nth-child argument and
    # matches nothing -- confirm the intended index for the salary field.
    loader.add_css('money', '.clearfix em:nth-child(-2)::text')
    loader.add_css('discript', '.bd')
    loader.add_css('job_addr', '.clearfix em:nth-child(5)::text')
    yield loader.load_item()
def parse_property(self, response):
    """Scrape one sold-property listing into a PropertyScrapperItem.

    The loaded item is handed to self.process, which post-processes the
    combined feature string stored under 'bed'.
    """
    loader = ItemLoader(PropertyScrapperItem(), response=response)
    field_selectors = (
        ('address', '.property-address::text'),
        ('suburb', 'dl.cN-featDetails-extended dd.suburb a::text'),
        ('sold_date', 'dl.cN-featDetails-extended dd.saleDate::text'),
        ('sold_price', 'dl.cN-featDetails-extended dd.price::text'),
        ('property_type', 'dl.cN-featDetails-extended dd.propertytype::text'),
        ('floorplan_url',
         '#Content_Content_propertyPhotos_FloorplanLink::attr(href)'),
        ('photo_url', '#Content_Content_propertyPhotos_lnkPhoto::attr(href)'),
        ('sales_type', 'dl.cN-featDetails-extended dd.saleType::text'),
        # Domain packs bed + bath + parking into one feature span; it is
        # stored under 'bed' and split later in self.process.
        ('bed', 'dl.s-featSummary dd p.features span::text'),
    )
    for field, selector in field_selectors:
        loader.add_css(field, selector)
    yield self.process(loader.load_item())
def get_app(self, response):
    """Scrape a Play Store app detail page into a PlayStoreItems item."""
    il = ItemLoader(item=PlayStoreItems(), response=response)
    # Shared processor turning relative hrefs into absolute URLs.
    make_absolute = Compose(
        lambda urls: [urljoin(response.url, u) for u in urls])
    il.add_css('app_id', '.details-wrapper::attr(data-docid)')
    il.add_css('name', '.document-title div::text')
    il.add_css('category', '.category span::text')
    il.add_css('category_url', '.category::attr(href)', make_absolute)
    il.add_css('price', '.details-actions .price span::text')
    il.add_css('offers_in_app_purchases', '.inapp-msg::text')
    il.add_css('stars_count', '.stars-count::text')
    il.add_css('video', '.details-trailer > span::attr(data-video-url)')
    il.add_css('screenshots', '.screenshot::attr(src)')
    il.add_xpath('description',
                 '//div[contains(@class, "show-more-content")]/div//text()')
    il.add_css('update_date', '[itemprop="datePublished"]::text')
    il.add_css('file_size', '[itemprop="fileSize"]::text')
    il.add_css('installs', '[itemprop="numDownloads"]::text')
    il.add_css('current_version', '[itemprop="softwareVersion"]::text')
    il.add_css('requires_android', '[itemprop="operatingSystems"]::text')
    il.add_css('offered_by', '[itemprop="author"] > a span::text')
    il.add_css('offered_by_url', '[itemprop="author"] > a::attr(href)',
               make_absolute)
    yield il.load_item()
def parse_titles(self, response, hub):
    """Collect the post titles of one hub page into a BlogCategory item.

    `hub` is stored verbatim as the category name.
    """
    category_loader = ItemLoader(item=BlogCategory(), response=response)
    category_loader.add_value('name', hub)
    category_loader.add_css(
        'posts', 'main > article h2.entry-title > a::text')
    yield category_loader.load_item()
def parse_property(self, response):
    """Scrape one sold-property page into a PropertyScrapperItem and
    post-process it via self.process."""
    ldr = ItemLoader(PropertyScrapperItem(), response=response)
    ldr.add_css('address', '.property-address::text')
    ldr.add_css('suburb', 'dl.cN-featDetails-extended dd.suburb a::text')
    ldr.add_css('sold_date', 'dl.cN-featDetails-extended dd.saleDate::text')
    ldr.add_css('sold_price', 'dl.cN-featDetails-extended dd.price::text')
    ldr.add_css('property_type',
                'dl.cN-featDetails-extended dd.propertytype::text')
    ldr.add_css('floorplan_url',
                '#Content_Content_propertyPhotos_FloorplanLink::attr(href)')
    ldr.add_css('photo_url',
                '#Content_Content_propertyPhotos_lnkPhoto::attr(href)')
    ldr.add_css('sales_type', 'dl.cN-featDetails-extended dd.saleType::text')
    # Domain represents bed + bath + parking as one feature string; it is
    # captured under 'bed' and split later by self.process.
    ldr.add_css('bed', 'dl.s-featSummary dd p.features span::text')
    yield self.process(ldr.load_item())
def parse_content(self, response): django_istance = self._Model.objects.filter(url=response.url) # django obj之前存在,并且不重抓则忽略此条 if django_istance and not self.refetch: return if django_istance: # 重抓此数据 django_istance.delete() sel = Selector(response) loader = ItemLoader(item=PaperEduItem(), response=response) # parse page loader.add_value('url', response.url) raw_html = None try: raw_html = response.body_as_unicode() except: raw_html = response.body.decode('latin-1') loader.add_value('raw_html', raw_html) for attr, css in self._CSS.iteritems(): loader.add_css(attr, css) for attr, xpath in self._XPATH.iteritems(): loader.add_xpath(attr, xpath) pub_css = '#right > div.grid_10.omega.alpha > div.r_two > div.cmtdiv .tip' tip = sel.css(pub_css) pub_date = tip.re(u'发布时间:\s*(\d+-\d+-\d+)') item = loader.load_item() # 特殊字段处理 # 站点标识 item['site_id'] = SITE_PAPER_EDU # 分类标识 title = sel.css('title::text').extract()[0] subject = title.split(' - ')[1] item['subject_id'] = SUBJECT_ID.get(subject, -1) # keywords页面不规范 for attr, xpath_correction in self._XPATH_CORRECTION.iteritems(): if not ''.join(item.get(attr, '')).strip(' ;\n'): item[attr] = sel.xpath(xpath_correction).extract()[0] try: pub_date = pub_date[0] pub_date = datetime.strptime(pub_date, self.PUB_DATE_FORMAT).date() except IndexError: pub_date = None except ValueError: pub_date = None item['pub_date'] = pub_date # transe attr for attr, value in item.iteritems(): if isinstance(value, list): item[attr] = self._JOIN.get(attr, '').join(value) # 字段替换,例如替换关键字中文逗号等 for attr,_r in self._REPLACE.iteritems(): old, new = _r item[attr] = re.sub(old, new, item[attr]) # 不规则页面元素替换,关键词中有使用空格切分和;切分的 for attr, _r in self._SPLIT_AND_JOIN.iteritems(): pattern, join_str, judge_func = _r if judge_func(item[attr]): item[attr] = join_str.join(re.split(pattern, item[attr])) return item