def parse_item(self, response): """ This function parses a property page. @url http://web:9312/properties/property_000000.html @returns items 1 @scrapes title price description address image_urls @scrapes url project spider server date """ # Create the loader using the response l = ItemLoader(item=PropertiesItem(), response=response) # Load fields using XPath expressions l.add_xpath("title", '//*[@itemprop="name"][1]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath( "price", './/*[@itemprop="price"][1]/text()', MapCompose(lambda i: i.replace(",", ""), float), re="[,.0-9]+" ) l.add_xpath("description", '//*[@itemprop="description"][1]/text()', MapCompose(unicode.strip), Join()) l.add_xpath("address", '//*[@itemtype="http://schema.org/Place"][1]/text()', MapCompose(unicode.strip)) l.add_xpath( "image_urls", '//*[@itemprop="image"][1]/@src', MapCompose(lambda i: urlparse.urljoin(response.url, i)) ) # Housekeeping fields l.add_value("url", response.url) l.add_value("project", self.settings.get("BOT_NAME")) l.add_value("spider", self.name) l.add_value("server", socket.gethostname()) l.add_value("date", datetime.datetime.now()) return l.load_item()
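# A minimal sketch of the PropertiesItem the loader above expects. The field
# names come from the @scrapes contract in the docstring; the module layout is
# an assumption, not the original project's code.
import scrapy


class PropertiesItem(scrapy.Item):
    # Primary fields
    title = scrapy.Field()
    price = scrapy.Field()
    description = scrapy.Field()
    address = scrapy.Field()
    image_urls = scrapy.Field()
    # Housekeeping fields
    url = scrapy.Field()
    project = scrapy.Field()
    spider = scrapy.Field()
    server = scrapy.Field()
    date = scrapy.Field()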
def parse_link_page(response):
    for post in response.xpath('//div[@data-type="link"]'):
        l = ItemLoader(RedditPostItem(), selector=post)
        post_root_xpath = './div[contains(@class, "entry")]'
        title = post_root_xpath + '/p[@class="title"]'
        tagline = post_root_xpath + '/p[@class="tagline"]'
        buttons = post_root_xpath + '/ul'
        l.add_xpath('title', title + '/a/text()')
        l.add_xpath('link', title + '/a/@href')
        l.add_xpath('poster', tagline + '/a[contains(@class, "author")]/text()')
        l.add_xpath('score', './div[contains(@class, "midcol")]/div[@class="score unvoted"]/text()')
        l.add_xpath('number_of_comments', buttons + '//a[contains(@class, "comments")]/text()')
        l.add_xpath('comments_link', buttons + '//a[contains(@class, "comments")]/@href')
        l.add_xpath('subreddit', './@data-subreddit')
        l.add_xpath('post_timestamp', tagline + '/time/@datetime')
        l.add_value('scrape_timestamp', datetime.datetime.now())
        item = l.load_item()
        item["comments"] = []
        if item["number_of_comments"] > 0:
            # if there are any comments for the post, go scrape them;
            # parse_comments is expected to yield the finished item
            yield scrapy.Request(item["comments_link"] + "?limit=500",
                                 callback=parse_comments,
                                 meta={'item': item})
        else:
            # no comments: the item is complete, emit it directly
            yield item
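# The callback above forwards the partly built item to parse_comments through
# request.meta. A hedged sketch of that callback, assuming RedditPostItem has
# a "comments" list field; the comment XPath is hypothetical.
def parse_comments(response):
    item = response.meta['item']
    # hypothetical selector -- adjust to the real comment markup
    for text in response.xpath('//div[contains(@class, "usertext-body")]//p/text()').extract():
        item['comments'].append(text)
    yield item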
def parse(self, response): l=ItemLoader(item=RentalItem(),response=response) l.add_xpath('price','//*[(@id = "main-info")]//*[contains(concat( " ", @class, " " ), concat( " ", "txt-big", " " )) and contains(concat( " ", @class, " " ), concat( " ", "txt-bold", " " ))]/text()') l.add_xpath('adress','//*[(@id = "addressPromo")]//*[contains(concat( " ", @class, " " ), concat( " ", "txt-bold", " " ))]/text()') l.add_value('url', response.url) return l.load_item()
def parse_item(self, response): # FIXME: fix array issue i = ItemLoader(item=SalefinderItem(), response=response) title = r'//div[@id="product-details-container"]//h1/text()' price = r'//div[@id="product-details-container"]//span[@class="price"]/text()' per = r'//div[@id="product-details-container"]//span[@class="price"]/text()' image_url = r'//a[@id="product-image-container"]//img/@src' i.add_xpath('title', title, MapCompose(unicode.lower)) i.add_xpath('price', price, re=r'[,.0-9]+') i.add_xpath('per', per, re=r'pk|each|kg') i.add_xpath('image_url', image_url) i.add_value('url', response.url) i.add_value('date', date.today().isoformat()) product_buy = response.xpath("//div[@class='product-container']//div[@id='product-buy']") product_buy_text = product_buy.extract_first().lower() # Detect the vendor from a product-buy div if 'coles' in product_buy_text: i.add_value('vendor', 'coles') elif 'woolworths' in product_buy_text: i.add_value('vendor', 'woolworths') else: i.add_value('vendor', 'unknown') return i.load_item()
def parse_content_page(self, response): # Detect if this is a redirection page m = redirect_re.search(response.body) if m: import requests new_url = m.group(1) new_content = requests.get(new_url).content response = scrapy.http.HtmlResponse(new_url, body=new_content) # Start scraping il = ItemLoader(item = LuliItem(), response=response) il.add_css('content', 'div#articleNew > p::text') il.add_css('content', 'div[itemprop="articleBody"] > p') il.add_css('date', 'div#articleDate::text') il.add_css('date', 'header > time[datetime]::attr(datetime)') il.add_css('title', 'div#articleNew > h1::text') il.add_css('title', 'h1[itemprop="headline"]::text') il.add_value('url', response.url) item = il.load_item() yield item
def parse(self, response): """ This function parses the categories and its subcategories on a gscholar web page. @url https://scholar.google.com/citations?view_op=top_venues&hl=de&vq=bus @returns items 1 1 @returns requests 0 0 @scrapes name subs """ # We need the div that is 'selected' i.e. contains gs_sel as a css class title_xp = '//*[@id="gs_m_broad"]/div[contains(@class,\'gs_sel\')]/a/span/text()' item = ItemLoader(item=CategoryItem(), response=response) title = response.xpath(title_xp).extract_first() item.add_value('name', title) subs = [] for sub in response.xpath('//*[@id="gs_m_rbs"]/ul/li/a'): s = {'name' : sub.xpath('text()').extract_first()} rel_url = sub.xpath('@href').extract_first() s['vq'] = parse_qs(urlparse(rel_url).query)[u'vq'][0] subs.append(s) req = Request(urljoin(response.url,rel_url), callback=self.parse_item) req.meta['parent'] = title yield req item.add_value('subs', subs) yield item.load_item()
def parse(self, response): sites = response.xpath('//table/tbody/tr') for site in sites: url = urljoin(response.url, site.xpath("td[2]/a/@href").extract_first()) urlLast = urljoin(response.url, site.xpath("td[3]/a/@href").extract_first()) item = DeathItem() loader = ItemLoader(item,selector=site) loader.add_xpath('Mid','td[1]/text()') loader.add_xpath('firstName','td[5]/text()') loader.add_xpath('lastName','td[4]/text()') loader.add_xpath('Date','td[8]/text()') loader.add_xpath('Race','td[9]/text()') loader.add_xpath('County','td[10]/text()') loader.add_xpath('Age','td[7]/text()') loader.add_value('OILink',url) loader.add_value('OLastStatement',urlLast) if url.endswith(("jpg","no_info_available.html")): loader.add_value('Description',u'') loader.add_value('Education',u'') if urlLast.endswith("no_last_statement.html"): loader.add_value('Message',u'') yield loader.load_item() else: request = scrapy.Request(urlLast, meta={"item" : loader.load_item()}, callback =self.parse_details2) yield request else: request = scrapy.Request(url, meta={"item": loader.load_item(),"urlLast" : urlLast}, callback=self.parse_details) yield request
def parse_image(self, response):
    logger.info("Collecting page data: %s ..." % response.url)
    loader = ItemLoader(item=MeiTuItem(), response=response)
    loader.add_xpath('publisher', "//div[@class='width']/div[@class='c_l']/p[1]/text()")
    loader.add_xpath('publisher', "//div[@class='width']/div[@class='c_l']/p[1]/a[@class='tags']/text()")
    loader.add_xpath('model_name', "//div[@class='width']/div[@class='c_l']/p[5]/text()")
    loader.add_xpath('model_name', "//div[@class='width']/div[@class='c_l']/p[5]/a[@class='tags']/text()")
    loader.add_xpath('publishtime', "//div[@class='width']/div[@class='c_l']/p[6]/text()")
    loader.add_xpath('magazine_no', "//div[@class='width']/div[@class='c_l']/p[2]/text()")
    loader.add_xpath('pic_qty', "//div[@class='width']/div[@class='c_l']/p[3]/text()")
    loader.add_xpath('pixel', "//div[@class='width']/div[@class='c_l']/p[4]/text()")
    loader.add_xpath('desc', "//p[@class='buchongshuoming']/text()")
    loader.add_xpath('tag', "//div[@class='fenxiang_l']/a[@class='tags']/text()")
    loader.add_xpath('sort', "//div[@class='weizhi']/span/a[2]/text()")
    loader.add_xpath('image_url', "//div[@class='content']/center/img[@class='content_img']/@src")
    loader.add_value("page_url", response.url)
    yield loader.load_item()
def parse_item(self, response): sel = response.css("div.path") loader = ItemLoader(item=SeriesItem(), selector=sel) loader.add_css("series_id", "a:last-child::attr(href)") loader.add_css("series_name", "a:last-child::text") series = loader.load_item() print(series) # 即将销售 & 在售 for sel in response.css("div.interval01-list-cars-infor"): loader = ItemLoader(item=ModelItem(), selector=sel) loader.add_css("model_id", "a::attr(href)") loader.add_css("model_name", "a::text") loader.add_value("series_id", series['series_id']) loader.add_value("series_name", series['series_name']) yield loader.load_item() # 停售 url = "http://www.autohome.com.cn/ashx/series_allspec.ashx" years = response.css(".dropdown-content a::attr(data)") for year in years.extract(): qs = { "y": year, "s": series["series_id"] } yield Request(url + "?" + urlencode(qs), self.stop_sale)
def parse_colleagues(self, response, author_id): self.logger.info('Parsing colleagues for author %s.' % author_id) # get all authors listed num_authors = 0 for div in response.xpath('//*[@class="gsc_1usr gs_scl"]'): num_authors += 1 name_xp = './*[@class="gsc_1usr_name"]/text()' id_val = urlparse.parse_qs(urlparse.urlparse(div.xpath('//*[@id="gsc_ccl"]/div[1]/div[2]/h3/a/@href').extract_first()).query)['user'] cited_by_xp = './*[@class="gsc_1_usr_cby"]/text()' fos_xp = './/a[@class="gsc_co_int"]/@href' # --> ["foo", "bar",...] # load general author item for colleague co_auth = ItemLoader(item=AuthorItem(), response=response, selector=div) co_auth.add_value('id', id_val) co_auth.add_xpath('name', name_xp) co_auth.add_xpath('cited', cited_by_xp) co_auth.add_xpath('fos', fos_xp) yield co_auth.load_item() # load co-authorship relation = [author_id, id_val] relation.sort() co_rel = ItemLoader(item=CoAuthorItem(), response=response) co_rel.add_value('author1', relation[0]) co_rel.add_value('author2', relation[1]) yield co_rel.load_item() self.logger.info('Found %d colleagues for author %s.' % (num_authors, author_id)) next_url = self.choose_next() if next_url: yield Request(url=next_url)
def _parse(self, response):
    l = ItemLoader(item=BookmarksItem(), response=response)
    l.add_xpath(u"name", u"/html/head/title/text()")
    l.add_xpath(u"anchors", u"//a/@href")
    l.add_xpath(u"description", u"/html/body/text()")
    l.add_value(u"last_updated", datetime.datetime.now())  # you can also use literal values
    return l.load_item()
def parse_item(self,response): l = ItemLoader(item =MeizituItem(),response = response) l.add_xpath('name','//h2/a/text()') l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p") l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src",Identity()) l.add_value('url', response.url) return l.load_item()
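# MeizituItem collects raw URLs in "image_urls" (kept as a list via
# Identity()), which is the field Scrapy's ImagesPipeline reads by default.
# A minimal settings.py sketch for actually downloading them; the storage
# path is an assumption.
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = '/path/to/images'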
def parse(self, response): match = re.search('/displaySeminarList/',response.url) if match: urls = response.xpath('//div[@class="internList splitEntry"]//@href').extract() for url in urls: url = response.urljoin(url) yield scrapy.Request(url, self.parse) else: table = response.xpath(self.seminar_list_xpath) corpId = parse_qs(urlparse(response.url).query)['corpId'] for index,semi in enumerate(table): loader = ItemLoader(SeminarItem(),semi) loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor = Join() loader.add_value('companyid',corpId) loader.add_xpath('name','//div[@id="headerWrap"]//h3/text()') loader.add_xpath('date','.//td[@class="date"]/text()',re='\d+\/\d+\/\d+') loader.add_xpath('time','.//td[@class="time"]/text()') loader.add_xpath('area','.//td[@class="area"]/text()') loader.add_xpath('place','.//td[@class="place"]/text()') loader.add_xpath('loc_n','.//td[@class="place"]//a', re='mycom_loc\|(\d+\/\d+\/\d+\.\d+)\,\d+\/\d+\/\d+\.\d+') loader.add_xpath('loc_e','.//td[@class="place"]//a', re='mycom_loc\|\d+\/\d+\/\d+\.\d+\,(\d+\/\d+\/\d+\.\d+)') loader.add_xpath('target','.//td[@class="target"]/text()') yield loader.load_item()
def parse_info(self, response): loaderJob = ItemLoader(item=JobInfoItem(), response=response) loaderCom = ItemLoader(item=ComInfoItem(), response=response) loaderJob.add_value('url', value=response.url) loaderJob.add_xpath('job_name', '//div[@class="inner-left fl"][1]/h1/text()', TakeFirstL()) loaderJob.add_xpath('job_company', '//div[@class="inner-left fl"][1]/h2/a/text()', TakeFirstL()) loaderJob.add_xpath('job_benefits', '//div[@class="inner-left fl"][1]/div/span/text()', JoinL('|')) divs = '//ul[@class="terminal-ul clearfix"]/li' loaderJob.add_xpath('job_salary', divs, TakeFirstL(), re=u'(?<=职位月薪:</span><strong>).*(?=</strong></li>)') loaderJob.add_xpath('job_location', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=工作地点:</span><strong>).*(?=</strong></li>)') loaderJob.add_xpath('job_update', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=发布日期:</span><strong>).*(?=</strong></li>)') loaderJob.add_xpath('job_nature', divs, TakeFirstL(), re=u'(?<=工作性质:</span><strong>).*(?=</strong></li>)') loaderJob.add_xpath('job_experience', divs, TakeFirstL(), re=u'(?<=工作经验:</span><strong>).*(?=</strong></li>)') loaderJob.add_xpath('job_miniEdu', divs, TakeFirstL(), re=u'(?<=最低学历:</span><strong>).*(?=</strong></li>)') loaderJob.add_xpath('job_recruNums', divs, TakeFirstL(), re=u'(?<=招聘人数:</span><strong>).*(?=</strong></li>)') loaderJob.add_xpath('job_category', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=职位类别:</span><strong>).*(?=</strong></li>)') loaderJob.add_xpath('job_desc', '//div[@class="tab-inner-cont"][1]', ExtractTextL(), StripBlankL(), JoinL('|')) loaderJob.add_xpath('job_desc_resp', '//div[@class="tab-inner-cont"][1]', ExtractTextL(), TakeFirstL(), re=u'(?<=岗位职责|工作职责).*?(?=任职资格|岗位要求)') loaderJob.add_xpath('job_desc_req', '//div[@class="tab-inner-cont"][1]', ExtractTextL(), TakeFirstL(), re=u'(?<=任职资格|岗位要求).*?(?=。)') loaderJob.add_xpath('job_desc_loc', '//div[@class="tab-inner-cont"][1]/h2/text()', TakeFirstL()) loaderCom.add_xpath('url', '//div[@class="company-box"]/p[@class="company-name-t"]/a/@href', TakeFirstL()) loaderCom.add_xpath('com_name', '//div[@class="company-box"]/p[@class="company-name-t"]/a/text()', TakeFirstL()) divs = '//div[@class="company-box"]/ul/li' loaderCom.add_xpath('com_size', divs, ExtractTextL(), TakeFirstL(), re=u'(?<=公司规模[:,:]).*') loaderCom.add_xpath('com_nature', divs, ExtractTextL(), TakeFirstL(), re=u'(?<=公司性质[:,:]).*') loaderCom.add_xpath('com_industry', divs, ExtractTextL(), TakeFirstL(), re=u'(?<=公司行业[:,:]).*') loaderCom.add_xpath('com_intro', '//div[@class="tab-inner-cont"][2]', ExtractTextL(), StripBlankL(), JoinL('|')) loaderCom.add_xpath('com_link', divs, ExtractTextL(), TakeFirstL(), re=u'(?<=公司主页[:,:]).*') loaderCom.add_xpath('com_address', divs, RemoveTagsL(), TakeFirstL(), re=u'(?<=公司地址[:,:])[\s\S]*(?=</strong>)') return loaderJob.load_item(), loaderCom.load_item()
def get_player_info(self, response):
    loader = ItemLoader(item=NFL_Player_2015(), response=response)
    loader.default_input_processor = MapCompose(unicode.strip)
    loader.default_output_processor = Join()

    # .extract() always returns a list; guard against an empty result instead
    # of indexing into it blindly
    number_and_position = response.xpath('//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[1]/text()').extract()
    if number_and_position:
        number_and_position = number_and_position[0]
        number = number_and_position.split()[0]
        position = number_and_position.split()[1]
    else:
        number = ''
        position = ''

    loader.add_value('number', number)
    loader.add_value('position', position)
    loader.add_xpath('name', '//*[@id="content"]/div[3]/div[2]/h1/text()')
    loader.add_xpath('team', '//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[3]/a/text()')
    yield loader.load_item()
def parse_item(self, response): """ This function parses a property page. @url http://localhost:9312/properties/property_000000.html @returns items 1 @scrapes title price description address image_urls @scrapes url project spider server date """ l = ItemLoader(item=PropertiesItem(), response=response) l.add_xpath('title', '//*[@itemprop="name"][1]/text()', MapCompose(unicode.strip, unicode.title)) l.add_xpath('price', '//*[@itemprop="price"][1]/text()', MapCompose(lambda i: i.replace(',', ''), float), re='[,.0-9]+') l.add_xpath('description', '//*[@itemprop="description"][1]/text()', MapCompose(unicode.strip), Join()) l.add_xpath('address', '//*[@itemtype="http://schema.org/Place"][1]/text()', MapCompose(unicode.strip)) l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src', MapCompose(lambda i: urlparse.urljoin(response.url, i))) # Housekeeping fields l.add_value('url', response.url) l.add_value('project', self.settings.get('BOT_NAME')) l.add_value('spider', self.name) l.add_value('server', socket.gethostname()) l.add_value('date', datetime.datetime.now()) return l.load_item()
def parse_titles(self, response): loader = ItemLoader(item=BlogCategory(), response=response) loader.add_value('hub', response.meta['hname']) loader.add_css('title', 'div.company_post h1 span::text') loader.add_css('date', 'div.published::text') loader.add_css('article', 'div.content::text') yield loader.load_item()
def parse(self, response):
    l = ItemLoader(item=Product(), response=response)
    l.add_xpath('name', '//div[@class="product_name"]')
    l.add_xpath('name', '//div[@class="product_title"]')
    l.add_xpath('price', '//p[@id="price"]')
    l.add_css('stock', 'p#stock')
    l.add_value('last_updated', 'today')  # you can also use literal values
    return l.load_item()
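# The loader above declares no processors inline, so they would normally live
# on the item. A sketch of a Product item carrying processors in Field
# metadata -- the processor choices are an assumption, not the original
# project's definitions.
import scrapy
from scrapy.loader.processors import Join, MapCompose, TakeFirst


class Product(scrapy.Item):
    name = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(),
    )
    price = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=TakeFirst(),
    )
    stock = scrapy.Field(output_processor=TakeFirst())
    last_updated = scrapy.Field(output_processor=TakeFirst())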
def parse_group_home_page(self, response): i = ItemLoader(item=DoubanGroupItem(), response=response) i.add_xpath('group_name', self._x_query['name']) i.add_value('group_url', response.url) i.add_xpath('group_members', self._x_query['members'], re='\((\d+)\)') i.add_xpath('relative_groups', self._x_query['relative_groups']) i.add_xpath('friend_groups', self._x_query['friend_groups']) return i.load_item()
def parse_item(self, response):
    # add_xpath() needs a selector, so the loader must be given the response
    l = ItemLoader(item=AskspiderItem(), response=response)
    l.add_xpath('q_title', "//h1[@class='ask_title']/text()", MapCompose(unicode.strip), Join())
    l.add_xpath('q_time', "//span[@class='ask_time']/text()", MapCompose(unicode.strip))
    l.add_xpath('q_province', "//div[@class='abouttdd']/ul/li[1]/h3/span/text()", MapCompose(unicode.strip))
    l.add_value('q_link', response.url)
    l.add_xpath('q_user', "//a[@class='ask_username']/text()")
    return l.load_item()
def parse(self, response): for item in self.find_items(response): loader = ItemLoader(item=self.item_class()) for target in self.get_targets(): loader.add_value(target.name, target.get_value(item, response)) val = self.Meta.detail_path.get_value(item, response) yield gen_request(val, self.parse_details, loader.load_item())
def test_load_item_using_default_loader(self): i = TestItem() i['summary'] = u'lala' il = ItemLoader(item=i) il.add_value('name', u'marta') item = il.load_item() assert item is i self.assertEqual(item['summary'], u'lala') self.assertEqual(item['name'], [u'marta'])
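# The test above documents the stock ItemLoader behaviour: collected values
# stay lists (item['name'] == [u'marta']). A sketch of a loader subclass that
# returns scalars instead; the class name is illustrative only.
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst


class SingleValueLoader(ItemLoader):
    default_output_processor = TakeFirst()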
def parse_content(self, response): logger.info('Dealing with images: %s', response.url) item_load = ItemLoader(item=ScrapyMeizituItem(), response=response) item_load.add_value('url', response.url) item_load.add_xpath('name', self._x_query['name']) item_load.add_xpath('tags', self._x_query['tags']) item_load.add_xpath('image_urls', self._x_query['image_urls']) return item_load.load_item()
def parse(self, response): item = ItemLoader(item=OrgItem(), response=response) item.add_value('id', self.curr) item.add_xpath('name', '//h2[@class="gsc_authors_header"]/text()') yield item.load_item() next_url = self.next_label_from_db() if next_url: yield Request(url=next_url,dont_filter=True)
def parse_content(self,response): bbsItem_loader = ItemLoader(item=BbsDmozItem(),response = response) url = str(response.url) bbsItem_loader.add_value('url',url) bbsItem_loader.add_xpath('forum',self._x_query['forum']) bbsItem_loader.add_xpath('poster',self._x_query['poster']) bbsItem_loader.add_xpath('content',self._x_query['page_content']) return bbsItem_loader.load_item()
def parse_detail(self, response): il = ItemLoader(NewsItem(), response=response) il.add_css("title", "%s::text" % self.title) il.add_css("date", "%s::text" % self.date) il.add_css("auth", "%s::text" % self.auth) il.add_css("content", "%s > p::text" % self.content) il.add_value("cate", response.meta["cate"]) return il.load_item()
def parse_member(self, response): loader = ItemLoader(item=MemberItem(), response=response) matchs = re.search(r'idDiputado%3D(\d+)', response.url) loader.add_value('id', matchs.groups()[0]) loader.add_xpath('name', '//div[@class="nombre_dip"]/text()') loader.add_xpath('term', '//div[@id="curriculum"]/div[@class="principal"]/text()') loader.add_xpath('province', '//div[@class="texto_dip"]/ul/li/div[@class="dip_rojo"]/text()') loader.add_xpath('party', '//div[@class="texto_dip"]/ul/li/div[@class="dip_rojo"]/a/text()') yield loader.load_item()
def parse_item(self, response): loader = ItemLoader(EolZhiyeItem(), response) loader.add_value('url', response.url) loader.add_value('code', response.url, re=r'/(\w+)\.shtml') loader.add_css('name', 'h1#pagetitle::text') loader.add_xpath('category', u'//div[@id="precontent"]/p[contains(., "行业")]/a/text()') loader.add_xpath('category2', u'//div[@id="precontent"]/p[contains(., "职业")]/a/text()') loader.add_xpath('detail', u'//div[@id="precontent"]/following-sibling::node()[not(self::table)]', Join('\n')) yield loader.load_item()
def parse_item(self,response): sel = Selector(response) l = ItemLoader(item =CarsItem(),response=response) name = sel.xpath('//div[@class="cartab-title"]/h2/a/text()').extract()[0].encode('utf-8') l.add_value('name',name) for link in sel.xpath('//a[@target="_blank"]/img/@src').extract(): link = link.replace('t_','u_') l.add_value('image_urls',link) # print link return l.load_item()
def parse_item_yj(self,response): l=ItemLoader(item=YjspiderItem(),response=response) l.add_xpath('yj_title',"//div[@class='ctd_head_left']/h2/text()",MapCompose(unicode.strip),Join()) l.add_xpath('yj_time',"//div[@class='w_journey']/dl/dt/span[2]/text()",MapCompose(unicode.strip)) l.add_value('yj_link',response.url) l.add_xpath('yj_looknum',"//a[@class='link_browse']/span/text()") l.add_xpath('yj_pl',"//a[@class='link_comment']/span/text()") l.add_xpath('yj_author',"//a[@id='authorDisplayName']/text()",MapCompose(unicode.strip)) l.add_xpath('yj_province',"//div[@class='breadbar_v1 cf']/ul/li[4]/a/text()") return l.load_item()
def parse_detail(self, response): # article_content = response.css('.article_content #content').extract() # article_content = response.xpath('//div[@class="article_content"]/div[@id="content"]').extract() # original_url = response.css('.article_detail a::attr(href)').extract_first() # # original_url = response.xpath('//div[@class="article_detail"]/a/@href').extract_first() tags = response.css('.article_more a::text').extract() if tags: tags = tags else: tags = '无' # # tags = response.xpath('//*[@class="article_more"]/a/text()').extract() item_loader = ItemLoader(item=ZakerItem(), response=response, dont_filter=True) item_loader.add_value('url_id', get_md5(response.url)) item_loader.add_value('article_url', response.url) item_loader.add_value('title', response.meta.get('title')) item_loader.add_value('media', response.meta.get('media')) item_loader.add_value('comments_num', response.meta.get('comments_num')) item_loader.add_value('img_url', response.meta.get('img_url')) item_loader.add_css('article_content', '.article_content #content') item_loader.add_css('original_url', '.article_detail a::attr(href)') item_loader.add_value('tags', tags) # item_loader.add_value('parse_time', datetime.datetime.now()) article_item = item_loader.load_item() yield article_item
def parse_item(self, response): loader = ItemLoader(item=SpiderItem(), response=response) content = '' try: title = response.xpath( r'//*[@class="dianzititle"]//text()').extract() date = response.xpath( r'//*[@id="InfoPickFromFieldControl"]//text()').extract_first( ) match = re.search(r'([0-9-]+)', date) if match: date = match.group(1) else: date = '1970-01-01' content = response.xpath( r'//*[@id="FreePlaceHoldersControl1"]//text()').extract() loader.add_value('date', date) loader.add_value('title', title) loader.add_value('content', content) except Exception as e: self.logger.error('error url: %s error msg: %s' % (response.url, e)) loader.add_value('date', '1970-01-01') loader.add_value('title', 'unknown') loader.add_value('content', '') finally: self.logger.info("crawled url: %s" % response.url) loader.add_value('url', response.url) loader.add_value('collection_name', self.name) loader.add_value("website", self.website) if content == '': self.logger.warning(' url: %s msg: %s' % (response.url, ' content is None')) yield loader.load_item()
def get_news(self, response): l = ItemLoader(item=SpiderItem(), response=response) l.add_value('title', response.xpath('//h1[@class="title"]/text()').extract()) l.add_value( 'title', response.xpath( '//span[@class="articletitle_p22"]/text()').extract()) l.add_value('title', response.xpath('//h1[@class="tit_h2"]/text()').extract()) l.add_value( 'title', response.xpath('//span[@class="gog_title"]/text()').extract()) l.add_value( 'title', response.xpath('//td[@class="gog_title"]/text()').extract()) l.add_value('date', response.xpath('//div[@class="info"]/text()').extract()) l.add_value( 'date', response.xpath('//span[@class="p12 LightGray2"]/text()').extract()) l.add_value( 'date', response.xpath('//div[@class="articletime"]/text()').extract()) l.add_value( 'date', response.xpath('//body/table[5]/tr[5]/td[2]/div/text()').extract()) l.add_value( 'date', response.xpath( '//body/table[6]/tr/td/table/tr/td/table[3]/tr/td/text()'). extract()) r1 = r"\d{4}.\d{1,2}.\d{1,2}" date0 = re.compile(r1) date = ''.join(l.get_collected_values('date')) date1 = date0.findall(date) l.replace_value('date', date1[0]) l.add_value( 'content', response.xpath('//div[@class="content"]/p/text()').extract()) l.add_value('content', response.xpath('//td[@class="p16"]/p/text()').extract()) l.add_value( 'content', response.xpath('//div[@class="content01 p16"]/p/text()').extract()) l.add_value( 'content', response.xpath('//div[@class="content"]/div/p/text()').extract()) l.add_value( 'content', response.xpath('//span[@class="gog_content"]/p/text()').extract()) l.add_value( 'content', response.xpath('//div[@class="content"]/p/a/text()').extract()) l.add_value( 'content', response.xpath('//td[@class="gog_content"]/p/text()').extract()) l.add_value( 'content', response.xpath( '//td[@class="gog_content"]/font/p/text()').extract()) l.add_value( 'content', response.xpath('//td[@class="p16"]/div/p/text()').extract()) l.add_value('url', response.url) l.add_value('collection_name', self.name) l.add_value('website', self.website) return l.load_item()
def parse_news(self, response): loader = ItemLoader(item=News(), response=response) loader.add_value('url', response.url) loader.add_value('media_id', self.media_id) loader.add_value('election_id', self.election_id) #parse title title_selectors = response.css('h1.read__title::text') if not title_selectors: return loader.load_item() title = title_selectors.extract_first() loader.add_value('title', title) #parse date date_selectors = response.css('div.read__time::text') if not date_selectors: return loader.load_item() date_str = date_selectors.extract_first() # eg: Kompas.com - 10/10/2017, 13:37 WIB time_arr = filter(None, re.split('[\s,-]', date_str))[1:3] info_time = ' '.join([_(s) for s in time_arr if s]) #parse date information try: published_at_wib = datetime.strptime(info_time, '%d/%m/%Y %H:%M') except ValueError as e: raise CloseSpider('cannot_parse_date: %s' % e) #convert to utc+0 published_at = wib_to_utc(published_at_wib) loader.add_value('published_at', published_at) #parse author name author_name_selectors = response.css( 'div.read__author::text').extract_first() if not author_name_selectors: loader.add_value('author_name', 'N/A') else: author_name = author_name_selectors loader.add_value('author_name', author_name) #parse raw content raw_content_selectors = response.css('div.read__content') if not raw_content_selectors: return loader.load_item() raw_content = raw_content_selectors.extract_first() loader.add_value('raw_content', raw_content) return loader.load_item()
def parse_activity(self, response): activity = response.xpath( 'normalize-space(.//h4[@class="ct-u-marginBottom20"])') company_names = response.xpath( './/*[contains(@class, "ct-product--tilte")]') company_addrs = response.xpath( './/*[contains(@class, "ct-product--description")]') company_contacts = response.xpath( './/div[contains(@id, "coordonnees")]') company_websites = response.xpath( './/*[contains(concat( " ", @class, " " ), concat( " ", "ct-product--description", " " ))]//a' ) for (name, addr, contact, website) in zip(company_names, company_addrs, company_contacts, company_websites): items = ItemLoader(item=YellowPageCrawlerItem()) # Activity denomination activity_name = activity.extract_first() items.add_value('activity', activity_name) # Name of the entity company_name = name.xpath( 'normalize-space(./text())').extract_first() items.add_value('name', company_name) # Address address = addr.xpath('./text()').getall() items.add_value('address', address) # Contact = Mail + Phone contact = contact.css('::text').getall() # Phone items.add_value('phone', contact) # Mail items.add_value('mail', contact) # Website company_website = website.css('::text').getall() items.add_value('website', company_website) yield items.load_item() next_page = response.css('a[aria-label=Next]::attr(href)').get() if next_page: yield SplashRequest( url=urljoin(self.base_url, next_page), callback=self.parse_activity, endpoint='execute', args={ 'lua_source': script, 'timeout': 10, 'wait': 10 }, )
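# The SplashRequest above assumes the scrapy-splash plumbing is enabled in
# settings.py. A minimal sketch of that configuration (values as documented in
# the scrapy-splash README; the Splash URL is an assumption).
SPLASH_URL = 'http://localhost:8050'
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'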
def parse_item(self, response): print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>') l = ItemLoader(item=PlainItem(), response=response) l.add_value('url', response.url) try: l.add_xpath('name', '/html/body/div[4]/div/div[1]/h1/text()') except: l.add_value('name', '') try: l.add_xpath('address', '/html/body/div[4]/div/div[1]/div/text()') except: l.add_value('address', '') try: l.add_xpath('build_year', '/html/body/div[6]/div[2]/div[2]/div[1]/span[2]/text()') except: l.add_value('build_year', '') try: l.add_xpath('buildings', '/html/body/div[6]/div[2]/div[2]/div[6]/span[2]/text()') except: l.add_value('buildings', '') try: l.add_xpath('familys', '/html/body/div[6]/div[2]/div[2]/div[7]/span[2]/text()') except: l.add_value('familys', '') try: l.add_xpath('area', '/html/body/div[5]/div[1]/a[3]/text()') except: l.add_value('area', '') try: l.add_value('subway', response.meta['subway']) except: l.add_value('subway', '') try: l.add_xpath('price', '/html/body/div[6]/div[2]/div[1]/div/span[1]/text()') except: l.add_value('price', '') try: l.add_xpath('estate', '/html/body/div[6]/div[2]/div[2]/div[4]/span[2]/text()') except: l.add_value('estate', '') # try: # details = response.xpath('//div[@class="p-parameter"]/ul[2]/*/text()').extract() # for i in range(len(details)): # l.add_value('item{}'.format(i), details[i]) # except: # for i in range(9): # l.add_value('item{}'.format(i), '') yield l.load_item()
def parse_item(self, response): loader = ItemLoader(item=HistoryItem(), response=response) loader.add_value('type', response.meta["type"]) loader.add_value('yiji', response.meta["yiji"]) loader.add_value('erji', response.meta["erji"]) parser = etree.HTML(response.text) root = parser.xpath("//div['answer_detail']/dl/dt")[0] if response.meta["type"] == "单选题" or response.meta["type"] == "多选题": # 整理选项 choose = [] choose_raw = root.xpath(".//td") for item in choose_raw: choose.append( util.deal_jam( util.from_choose_item_get_content( etree.tostring( item, encoding='utf-8', pretty_print=True).decode('utf-8')))) item.getparent().remove(item) # root中剔除table选项 rm_tb_list = root.xpath(".//table") for rm_item in rm_tb_list: rm_item.getparent().remove(rm_item) pass # load answer_raw = response.xpath( "//div[@class='answer_detail']/dl/dd/p[1]/i/text()" ).extract_first() answer_list = [s_item for s_item in answer_raw] answer_index_list = list() for answer in answer_list: if answer == "A" or answer == "a": answer_index_list.append(0) elif answer == "B" or answer == "b": answer_index_list.append(1) elif answer == "C" or answer == "c": answer_index_list.append(2) elif answer == "D" or answer == "d": answer_index_list.append(3) loader.add_value('answer_index', answer_index_list) loader.add_value( 'content', util.from_content_get_real_content( etree.tostring(root, encoding='utf-8', pretty_print=True, method='html').decode('utf-8'))) loader.add_value('choose', choose) pass else: loader.add_value( 'content', util.from_content_get_real_content( etree.tostring(root, encoding='utf-8', pretty_print=True, method='html').decode('utf-8'))) loader.add_value('choose', None) pass loader.add_value( 'answer', util.replace_i( response.xpath("//div[@class='answer_detail']/dl/dd/p[1]/i"). extract_first())) loader.add_value( 'analysis', util.get_full_analysis( util.replace_i( response.xpath("//div[@class='answer_detail']/dl/dd/p[2]/i" ).extract_first()))) yield loader.load_item() pass
def parse(self, response): agent = user_agent_rotator.get_random_user_agent() options.add_argument(f"user-agent={agent}") self.driver = webdriver.Chrome(str(Path(Path.cwd(), "chromedriver.exe")), chrome_options=options) # self.driver = webdriver.Firefox(executable_path=str(Path(Path.cwd(), "geckodriver.exe"))) self.driver.set_window_size(randrange(1100, 1200), randrange(800, 900)) self.driver.get( "https://www.kyero.com/en/tenerife-apartments-for-sale-0l55570g1?max_price=150000&min_beds=2&min_property_size=40&sort=popularity_desc/" ) sleep(2) body = self.driver.find_element_by_css_selector('body') body.send_keys(Keys.PAGE_DOWN) sleep(1) body.send_keys(Keys.PAGE_UP) sleep(1) body.send_keys(Keys.PAGE_DOWN) body.send_keys(Keys.HOME) sel = Selector(text=self.driver.page_source) pages = sel.xpath( './/span[@class="search-results__count"]/text()').extract()[0] pages = pages.split(" ")[0] pages = pages.replace(",", "") pages = int(pages) / 20 pages_count = int(pages) + 1 sleep(1) self.driver.quit() for page in range(1): agent = user_agent_rotator.get_random_user_agent() options.add_argument(f"user-agent={agent}") self.driver = webdriver.Chrome(str( Path(Path.cwd(), "chromedriver.exe")), chrome_options=options) self.driver.set_window_size(randrange(1100, 1200), randrange(800, 900)) self.driver.get( f"https://www.kyero.com/en/tenerife-apartments-for-sale-0l55570g1?max_price=150000&min_beds=2&min_property_size=40&page={page}&sort=popularity_desc" ) sleep(1) body = self.driver.find_element_by_css_selector('body') sleep(1) body.send_keys(Keys.END) sleep(1) body.send_keys(Keys.HOME) try: picture = self.driver.find_elements_by_css_selector('figure')[ randrange(1, 5)] hov = ActionChains(driver).move_to_element(picture) hov.perform() except: pass sel = Selector(text=self.driver.page_source) adverts = sel.xpath('//article[contains(@class, "bg-white")]') for advert in adverts: try: l = ItemLoader(item=IslandScraperItem(), selector=advert) title = advert.xpath( './/a[contains(@class, "inline-block hover-underline")]/text()' ).extract_first() link_string = advert.xpath( './/a[contains(@class, "inline-block hover-underline")]/@href' ).extract_first() link = "https://www.kyero.com" + link_string locality = title.split(" in ")[1] details = advert.xpath( './/ul[contains(@class, "flex")]/li/span/text()') price_string = advert.xpath( './/span[contains(@class, "p-5")]/text()' ).extract_first()[1:] if price_string: price = price_string.split(" ")[1] price = price[1:] price = price.replace(",", "") beds = advert.xpath( './/ul[contains(@class, "flex")]/li/span/text()' ).extract_first() size_string = advert.xpath( './/ul[@class="flex"]/li/span/text()')[-1].extract() size = size_string.split(" ")[0] date = datetime.today().strftime('%Y-%m-%d') except: pass l.add_value('title', title) l.add_value('island', "Tenerife") l.add_value('locality', locality) l.add_value('price', price) l.add_value('beds', beds) l.add_value('size', size) l.add_value('link', link) l.add_value('date', date) l.add_value('ad_type', "sale") yield l.load_item() sleep(5) self.driver.quit()
def parse(self, response): self.driver = webdriver.Chrome( 'D:/PYTHON/WebScraping/chromedriver', chrome_options=options) ## path to chromedriver on disk self.driver.get('https://linkedin.com/') ## Login handling username = self.driver.find_element_by_class_name('login-email') username.send_keys('XXXX') sleep(0.5) password = self.driver.find_element_by_id('login-password') password.send_keys('XXXXX') sleep(0.5) sign_in_button = self.driver.find_element_by_xpath( '//*[@type="submit"]') sign_in_button.click() sleep(2) for element in link_urls: l = ItemLoader(item=FaangItem(), selector=element) self.driver.get(element) ## Window scroller to discover button self.driver.execute_script("window.scrollTo(0, 1600);") try: show_more_button = self.driver.find_element_by_xpath( '//*[@class="pv-profile-section__card-action-bar pv-skills-section__additional-skills artdeco-container-card-action-bar"]' ) except: sleep(1) try: self.driver.execute_script("window.scrollTo(0, 2100);") show_more_button = self.driver.find_element_by_xpath( '//*[@class="pv-profile-section__card-action-bar pv-skills-section__additional-skills artdeco-container-card-action-bar"]' ) except: sleep(1) try: self.driver.execute_script("window.scrollTo(0, 2600);") show_more_button = self.driver.find_element_by_xpath( '//*[@class="pv-profile-section__card-action-bar pv-skills-section__additional-skills artdeco-container-card-action-bar"]' ) except: sleep(1) try: self.driver.execute_script( "window.scrollTo(0, 3600);") show_more_button = self.driver.find_element_by_xpath( '//*[@class="pv-profile-section__card-action-bar pv-skills-section__additional-skills artdeco-container-card-action-bar"]' ) except: pass sleep(2) ## Skill & Country Extractor: try: actions = ActionChains(self.driver) actions.move_to_element(show_more_button).perform() show_more_button.click() sleep(3) sel = Selector(text=self.driver.page_source) country = sel.xpath( "normalize-space(.//h3/text())").extract_first() top_skills = sel.xpath( './/*[@class="Sans-17px-black-100%-semibold"]/text()' ).extract()[0:3] div = sel.xpath( './/div[@class="pv-skill-category-list pv-profile-section__section-info mb6 ember-view"]' ) skill_sets = [] for group in div: skill_group = group.xpath('./h3/text()').extract_first() skills = group.xpath( './/*[@class="pv-skill-category-entity__name "]/a/span/text()' ).extract() skill_set = {skill_group: skills} skill_sets.append(skill_set) l.add_value('country', country) l.add_value('top_skills', top_skills) l.add_value('skill_sets', skill_sets) except: pass yield l.load_item() self.driver.quit()
def parse_item(self, response): """ @url https://www.vinabook.com/lam-quen-thong-ke-hoc-qua-biem-hoa-p71348.html @returns items 1 @scrapes name name_unidecode price description @scrapes url project spider server date """ l = ItemLoader(item=BooksItem(), response=response) l.add_value('name', l.get_xpath('//*[@itemprop="title"]/text()')[-1]) l.add_value( 'name_unidecode', unidecode(l.get_xpath('//*[@itemprop="title"]/text()')[-1])) l.add_xpath('price', '//*[contains(@id, "discounted_price")]/span/text()', TakeFirst()) l.add_xpath('author', '//*[@itemprop="author"]/text()') l.add_value( 'description', filter(None, [ re.sub('<[^<]+?>', '', i) for i in l.get_xpath('//*[@class="full-description"]/p') ]), Join('\n')) l.add_xpath('image_uri', '//*[@itemprop="image"]/@src') # Information fields l.add_value('url', response.url) l.add_value('project', self.settings.get('BOT_NAME')) l.add_value('spider', self.name) l.add_value('server', socket.gethostname()) l.add_value('date', datetime.datetime.now()) return l.load_item()
def parse_item(self, response): # print(response.data['html']) item = ItemLoader(item=Hb5217Item(), response=response) url = response.url item_list = item_code(url, self.web_name, 'id=(.*?)$') item.add_value('web_name', self.web_name) item.add_value('web_code', self.name) item.add_value('url', url) item.add_value('item_code', item_list.get('item_code')) item.add_css('title', 'h3 a::text') item.add_css('amount', '.listcon em::text') item.add_css('rate', '.listcon em::text') item.add_css('period', '.listcon em::text') # item.add_css('loan_using', '::text') item.add_css('loaner_info', 'dl') item.add_css('pay_type', 'li em::text') item.add_css('progress', '#progressPrecent::text') # invest records i_v = [] invest_records_temp = '{{username={lst[3]}|rate=-1|postmoney={lst[6]}|money={lst[6]}|postdate={lst[8]}|status=全部通过}}' invest_records_format = "" tr = response.css('.box_view_4 tr').css('tr') try: for i in tr: lst = i.css('td *::text').extract() if lst: i_v.append(lst) # print(i_v) for n in i_v: invest_records_format += invest_records_temp.format(lst=n) item.add_value('invest_records', invest_records_format) item.add_value('start', i_v[0][8]) item.add_value('end', i_v[-1][8]) except Exception: self.logger.info('invest records is error %s' % url) yield item.load_item()
def parse_interview(self, response): # # date broadcast_date = response.xpath( '//div[@class="date left"]//strong/text()').get() locale.setlocale(locale.LC_TIME, 'ru_RU.UTF-8') broadcast_date = datetime.datetime.strptime(broadcast_date, u'%d %B %Y').date() # guest name guest_name = response.xpath( '//div[contains(@class, "author")]//*[@class="name"]/text()').get( ) # # guest title guest_title = response.xpath( '//div[contains(@class, "author")]//*[@class="post"]/text()').get( ) # # host name host_name = response.xpath( '//div[contains(@class, "lead")]//a//text()').get() interview_exists = session_test.query(exists().where( and_(InterviewParagraph.date == broadcast_date, InterviewParagraph.guest_name == guest_name))).scalar() if not interview_exists: text = response.xpath('//div[@class="mmplayer"]//p').getall() whole_interview = [] current_text = "" current_speaker = "" for index, paragraph in enumerate(text): # chunk_name = paragraph.xpath('name()') chunk = clean_chunk(paragraph) if len(chunk) > 1: current_speaker = chunk[0] current_text = chunk[-1] elif len(chunk) == 1: current_text += " " current_text += chunk[0] if (index + 1) < len(text): next_chunk = clean_chunk(text[index + 1]) if len(next_chunk) != 1 and len(current_text) > 0: if len(current_speaker) > 0: whole_interview.append( [index, current_speaker, current_text]) else: whole_interview[-1][2] = whole_interview[-1][ 2] + " " + current_text current_text = "" current_speaker = "" else: pass else: if len(current_text) > 0: whole_interview.append( [index, current_speaker, current_text]) # current_text = "" # current_speaker = "" # print(whole_interview) # for i in whole_interview: # # print(i) # pass entry = { "date": broadcast_date, "guest_name": guest_name, "guest_title": guest_title, "host_name": host_name, "interview": whole_interview, } if len(entry['interview']) > 10: print("=======================================") print(entry['date']) print(entry['guest_name']) print(len(entry['interview'])) for i in entry['interview']: interview_item = ItemLoader(item=InterviewItem(), response=response) interview_item.add_value('date', entry['date']) interview_item.add_value('guest_name', entry['guest_name']) interview_item.add_value('guest_title', entry['guest_title']) interview_item.add_value('host_name', entry['host_name']) interview_item.add_value('index', int(i[0])) interview_item.add_value('speaker', str(i[1])) interview_item.add_value( 'text', str(i[2]).encode('utf-8').decode('utf-8')) # interview_item.add_value('text', str(i[2]).decode('utf-8', 'ignore')) interview_item.add_value('url', response.url) item = interview_item.load_item() yield (item)
def parse(self, response): #initialize collector item which stores the website's content and meta data loader = ItemLoader(item=Collector()) loader.add_value("dl_slot", response.request.meta.get('download_slot')) loader.add_value("redirect", self.checkRedirectDomain(response)) loader.add_value("start_page", response.url) loader.add_value("start_domain", self.subdomainGetter(response)) loader.add_value("scraped_urls", [response.urljoin(response.url)]) loader.add_value("scrape_counter", 1) loader.add_value("scraped_text", [self.extractText(response)]) loader.add_value("error", "None") loader.add_value("ID", response.request.meta["ID"]) #initialize the fingerprints set which stores all fingerprints of visited websites fingerprints = set() #add the fingerprints of the start_page fingerprints.add(request_fingerprint(response.request)) #if there was an initial redirect, the new domain is added to the allowed domains domain = self.subdomainGetter(response) if domain not in self.allowed_domains: self.allowed_domains.append(domain) self.refreshAllowedDomains() #extract all urls from the page... urls = response.xpath("//a/@href").extract() + response.xpath( "//frame/@src").extract() + response.xpath( "//frameset/@src").extract() #...and safe them to a urlstack urlstack = [response.urljoin(url) for url in urls] #attach the urlstack, the loader, and the fingerprints to the response... response.meta["urlstack"] = urlstack response.meta["loader"] = loader response.meta["fingerprints"] = fingerprints #...and send it over to the processURLstack function return self.processURLstack(response)
def errorback(self, failure): loader = ItemLoader(item=Collector()) if failure.check(HttpError): response = failure.value.response loader.add_value("dl_slot", response.request.meta.get('download_slot')) loader.add_value("start_page", "") loader.add_value("scraped_urls", "") loader.add_value("redirect", [None]) loader.add_value("scraped_text", "") loader.add_value("error", response.status) loader.add_value("ID", response.request.meta["ID"]) yield loader.load_item() elif failure.check(DNSLookupError): request = failure.request loader.add_value("dl_slot", request.meta.get('download_slot')) loader.add_value("start_page", "") loader.add_value("scraped_urls", "") loader.add_value("redirect", [None]) loader.add_value("scraped_text", "") loader.add_value("error", "DNS") loader.add_value("ID", request.meta["ID"]) yield loader.load_item() elif failure.check(TimeoutError, TCPTimedOutError): request = failure.request loader.add_value("dl_slot", request.meta.get('download_slot')) loader.add_value("start_page", "") loader.add_value("scraped_urls", "") loader.add_value("redirect", [None]) loader.add_value("scraped_text", "") loader.add_value("error", "Timeout") loader.add_value("ID", request.meta["ID"]) yield loader.load_item() else: request = failure.request loader.add_value("dl_slot", request.meta.get('download_slot')) loader.add_value("start_page", "") loader.add_value("scraped_urls", "") loader.add_value("redirect", [None]) loader.add_value("scraped_text", "") loader.add_value("error", "other") loader.add_value("ID", request.meta["ID"]) yield loader.load_item()
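# errorback() only fires for requests that are wired to it. A hedged sketch of
# how the requests feeding the Collector items above might be built; the start
# URL handling is an assumption.
def start_requests(self):
    for i, url in enumerate(self.start_urls):
        yield scrapy.Request(
            url,
            callback=self.parse,
            errback=self.errorback,  # download/DNS/timeout failures end up here
            meta={'ID': i},          # errorback() reads request.meta["ID"]
        )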
def parse_race_denma(self, response): """ Parse denma page. @url https://keiba.yahoo.co.jp/race/denma/1906050201/ @returns items 1 @returns requests 1 @race_denma """ logger.info(f"#parse_race_denma: start: url={response.url}") # Parse race info logger.debug("#parse_race_denma: parse race info") loader = ItemLoader(item=RaceInfoItem(), response=response) race_id = response.url.split("/")[-2] loader.add_value("race_id", race_id) loader.add_xpath("race_round", "//td[@id='raceNo']/text()") loader.add_xpath("start_date", "//p[@id='raceTitDay']/text()[1]") loader.add_xpath("start_time", "//p[@id='raceTitDay']/text()[3]") loader.add_xpath("place_name", "//p[@id='raceTitDay']/text()[2]") loader.add_xpath("race_name", "//div[@id='raceTitName']/h1/text()") loader.add_xpath("course_type_length", "//p[@id='raceTitMeta']/text()[1]") loader.add_xpath("weather", "//p[@id='raceTitMeta']/img[1]/@alt") loader.add_xpath("course_condition", "//p[@id='raceTitMeta']/img[2]/@alt") loader.add_xpath("race_condition_1", "//p[@id='raceTitMeta']/text()[6]") loader.add_xpath("race_condition_2", "//p[@id='raceTitMeta']/text()[7]") loader.add_xpath("added_money", "//p[@id='raceTitMeta']/text()[8]") i = loader.load_item() logger.debug(f"#parse_race_denma: race info={i}") yield i # Parse race denma logger.debug("#parse_race_denma: parse race denma") for tr in response.xpath( "//table[contains(@class, 'denmaLs')]/tr[position()>1]"): loader = ItemLoader(item=RaceDenmaItem(), selector=tr) loader.add_value("race_id", race_id) loader.add_xpath("bracket_number", "td[1]/span/text()") loader.add_xpath("horse_number", "td[2]/strong/text()") loader.add_xpath("horse_id", "td[3]/a/@href") loader.add_xpath("trainer_id", "td[3]/span/a/@href") loader.add_xpath("horse_weight_and_diff", "string(td[4])") loader.add_xpath("jockey_id", "td[5]/a/@href") loader.add_xpath("jockey_weight", "td[5]/text()") loader.add_xpath("prize_total_money", "td[7]/text()[3]") i = loader.load_item() logger.debug(f"#parse_race_denma: race denma={i}") yield i # Parse link logger.debug("#parse_race_denma: parse link") for a in response.xpath("//a"): href = a.xpath("@href").get() if href.startswith("/directory/horse/") \ or href.startswith("/directory/trainer/") \ or href.startswith("/directory/jocky/"): yield self._follow_delegate(response, href) yield self._follow_delegate(response, f"/odds/tfw/{race_id}/") yield self._follow_delegate(response, f"/race/result/{race_id}/")
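# The docstring above uses a custom @race_denma contract alongside the
# built-in @url/@returns ones, so `scrapy check` can exercise this callback.
# A hedged sketch of how such a contract could be declared and registered;
# the assertion body and module path are assumptions.
from scrapy.contracts import Contract


class RaceDenmaContract(Contract):
    name = 'race_denma'

    def post_process(self, output):
        # output is the list of items/requests the callback produced
        if not output:
            raise AssertionError('parse_race_denma yielded nothing')

# settings.py
# SPIDER_CONTRACTS = {
#     'myproject.contracts.RaceDenmaContract': 10,
# }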
def parse_item(self, response): if response.status == 200: soup = bs(response.text, "lxml") print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>') # 详情页抽取数据 l = ItemLoader(item=JingdongItem(), response=response) # 使用add_xpath方法,传递Item类的字段名称和对应的xpath解析语法 l.add_value('cate_key', response.meta['cate_key']) l.add_value('goods_url', response.url) try: l.add_xpath('platform', '//div[@id="logo-2014"]/a/text()') except: l.add_value('platform', '') try: l.add_xpath('shop_name', '//div[@id="popbox"]/div/div[1]/h3/a/text()') except: l.add_value('shop_name', '') try: l.add_xpath('goods_name', '//div[@class="sku-name"]/text()') except: l.add_value('goods_name', '') try: l.add_xpath( 'now_price', '//div[@class="summary-price-wrap"]/div[1]/div[2]/span/span[2]/text()' ) except: l.add_value('now_price', '') try: l.add_xpath('origin_price', '//*[@id="page_origin_price"]/text()') except: l.add_value('origin_price', '') # try: # l.add_xpath('mon_sales', '//p[@id="price"]') # except: # l.add_value('mon_sales', '') try: l.add_xpath('total_views', '//div[@id="comment-count"]/a/text()') except: l.add_value('total_views', '') # try: # l.add_xpath('stock', '//p[@id="price"]') # except: # l.add_value('stock', '') try: l.add_xpath('brand', '//ul[@id="parameter-brand"]/li/a/text()') except: l.add_value('brand', '') try: details = soup.select('.p-parameter > ul[2] > li') for i in range(len(details)): l.add_value('item{}'.format(i), details[i].string) except: for i in range(9): l.add_value('item{}'.format(i), '') yield l.load_item() elif response.status == 202: cate_key = response.meta['cate_key'] yield Request(response.url, callback=self.parse_item, dont_filter=True, meta={ 'cate_key': cate_key, })
def parse_item(self, response): """ @url http://splash:8050/render.html?&url=https://vlogtruyen.net/bokutachi-wa-hanshoku-wo-yameta.html&wait=1 @scrapes name unicode_name source image_src total_chap description chapters web_source full """ manga = ItemLoader(item=MangaCrawlerItem(), response=response) manga.add_xpath("unicode_name", '//h1[@class="title-commic-detail"]/text()') manga.add_value("name", unidecode(manga.get_output_value("unicode_name")[0])) manga.add_value("source", response.url) manga.add_xpath("image_src", '//meta[@property="og:image"]/@content') manga.add_xpath("description", '//*[@class="desc-commic-detail"]/text()', Join("\n")) chapter_xpath = '//*[@class="ul-list-chaper-detail-commic"]/li/a' chapter_source = manga.get_xpath(chapter_xpath + "/@href") chapter_name = manga.get_xpath(chapter_xpath + "/h3/text()") chapters = zip(chapter_name, chapter_source) if "Đã hoàn thành" in manga.get_xpath( '//*[@class="manga-status"]/p/text()'): manga.add_value("full", True) else: manga.add_value("full", False) manga.add_value( "total_chap", manga.get_xpath( '//*[@class="ul-list-chaper-detail-commic"]/li[1]/a/h3/text()', MapCompose(lambda x: re.findall(r"(\d+(?:\.\d+)?)", x)), TakeFirst(), ), ) manga.add_value("chapters", chapters) manga.add_value("web_source", "vlogtruyen") return manga.load_item()
def parse(self, response):
    l = ItemLoader(item=YoutubeTrendingItem(), response=response)
    self.driver.get(response.url)
    self.driver.execute_script('window.scrollTo(1, 500);')
    sleep(5)
    self.driver.execute_script('window.scrollTo(1, 3000);')
    sel = Selector(text=self.driver.page_source)

    # no trailing commas here -- they would turn every value into a 1-tuple
    title = self.get_title(sel)
    url = self.get_url(response)
    views = self.get_views(sel)
    duration = self.get_duration(sel)
    likes = self.get_likes(sel)
    dislikes = self.get_dislikes(sel)
    channelName = self.get_channel_name(sel)
    subscribers = self.get_subscribers(sel)
    description = self.get_description(sel)
    keywords = self.get_keywords(sel)
    date_published = self.get_date_published(sel)
    date_scraped = self.get_date_scraped()
    tags = self.get_tags(sel)
    #n_comments = self.get_number_of_comments(sel)
    image_urls = self.get_image_url(response)
    comments = self.get_comments()

    l.add_value('title', title)
    l.add_value('url', url)
    l.add_value('views', views)
    l.add_value('duration', duration)
    l.add_value('likes', likes)
    l.add_value('dislikes', dislikes)
    l.add_value('channelName', channelName)
    l.add_value('subscribers', subscribers)
    l.add_value('description', description)
    l.add_value('keywords', keywords)
    l.add_value('date_published', date_published)
    l.add_value('date_scraped', date_scraped)
    l.add_value('tags', tags)
    #l.add_value('n_comments', n_comments)
    l.add_value('comments', comments)
    l.add_value('image_urls', image_urls)
    yield l.load_item()
def parse_item(self, response): with open('item1.html', 'w', encoding='utf-8-sig') as f: f.write(str(response.data['html'])) print('startparseitem---------------------------------') print(response.real_url) print(response.css('.p-parameter-list *::text').extract()) l = ItemLoader(item=JiadianItem(), response=response) l.add_css('name', '.sku-name::text', TakeFirst()) l.add_css('id', '.follow::attr(data-id)', TakeFirst()) l.add_css('price', '.price::text', TakeFirst()) l.add_css('brand', '#parameter-brand a::text', TakeFirst()) l.add_css('parameter', '.p-parameter-list *::text') l.add_value('parameter', '无') l.add_css('summary_service', '#summary-service span::text', TakeFirst()) l.add_css('summary_service', '#summary-service a::text', TakeFirst()) l.add_css('add_service', '#summary-support span::text') l.add_value('add_service', '无') l.add_css('sales_promotion', '.p-promotions em.hl_red::text') l.add_css('sales_promotion', '.prom-item em.hl_red::text') l.add_value('sales_promotion', '无') l.add_css('store', '.J-hove-wrap a::text', TakeFirst()) l.add_css('store_link', '.J-hove-wrap a::attr(href)', TakeFirst()) l.add_css('store', 'strong a[clstag*="allpingjia"]::text', TakeFirst()) l.add_css('store_link', 'strong a[clstag*="allpingjia"]::attr(href)', TakeFirst()) l.add_css('commentsnum', '.J-comments-list li[clstag*="allpingjia"]::attr(data-num)', TakeFirst()) l.add_css('goodcomments', '.percent-con::text', TakeFirst()) l.add_css('goodcommentnum', '.J-comments-list li[clstag*="haoping"]::attr(data-num)', TakeFirst()) l.add_css('comment_tags', '.tag-list span::text') l.add_css('mediumcommentnum', '.J-comments-list li[clstag*="zhongping"]::attr(data-num)', TakeFirst()) l.add_css('badcommentnum', '.J-comments-list li[clstag*="chaping"]::attr(data-num)', TakeFirst()) l.add_value('store', '无') l.add_value('store_link', '无') l.add_value('commentsnum', '0') l.add_value('goodcomments', '0') l.add_value('goodcommentnum', '0') l.add_value('comment_tags', '无') l.add_value('mediumcommentnum', '0') l.add_value('badcommentnum', '0') l.add_value('summary_service', '无') l.add_value('url', response.url) l.add_value('price', response.meta['price']) l.add_value('brand', '范思哲(VERSACE)') return l.load_item()
def save_to_csv(self, response, **meta): # self.state['items_count'] = self.state.get('items_count', 0) + 1 il = ItemLoader(item=NmSosSpiderItem(), response=response) il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars) #il.add_value('ingestion_timestamp', Utils.getingestion_timestamp()) il.add_value('company_name', meta['company_name']) il.add_value('entity_id', meta['business_id']) il.add_value('dba_name', meta['dba_name']) il.add_value('company_subtype', meta['company_subtype']) il.add_value('non_profit_indicator', meta['non_profit_indicator']) il.add_value('location_address_string', meta['location_address_string']) il.add_value('status', meta['status']) il.add_value('creation_date', meta['creation_date']) il.add_value('domestic state', meta['domestic_state']) il.add_value('period of duration', meta['peroid_of_duration']) il.add_value('business purpose', meta['business_purpose']) il.add_value('mixed_subtype', meta['officer_title']) il.add_value('mixed_name', meta['officer_name']) il.add_value('person_address_string', meta['officer_address']) il.add_value('permit_type', 'business_license') il.add_value('sourceName', 'NM_SOS') il.add_value( 'url', 'https://portal.sos.state.nm.us/BFS/online/CorporationBusinessSearch' ) return il
def parse_product(self, response):
    # Stop if the COUNTER reaches the maximum set value
    if self.COUNTER >= self.COUNT_MAX:
        raise scrapy.exceptions.CloseSpider(
            reason='COUNT_MAX value reached - {} items'.format(self.COUNT_MAX))

    # Check if the product is available
    no_available_message = response.xpath('//h2[contains(text(), "Darn")]')
    if no_available_message:
        return []

    # Create the ItemLoader object that stores each product's information
    l = ItemLoader(item=ProductItem(), response=response)

    # Get the product ID (ex: 666125766)
    product_id = response.url.split('/')[4]
    l.add_value('product_id', product_id)

    # Get the product title
    #l.add_xpath('title', '//meta[@property="og:title"]/@content')
    l.add_xpath(
        'title',
        '//div[@data-component="listing-page-title-component"]/h1/text()')
    #l.add_xpath('title', "//h1[@data-listing-id='{}']".format(response.url.split('/')[4]))

    # Get the product price
    l.add_xpath('price', '//*[contains(@data-buy-box-region, "price")]//p')

    # Get the product URL (ex: www.etsy.com/listing/666125766)
    l.add_value('url', '/'.join(response.url.split('/')[2:5]))

    # Get the product description
    l.add_xpath('description', '//div[@data-id="description-text"]/div/p/text()')

    # Get each product option and save it in a list
    product_options = []
    product_options_list = response.xpath(
        '//*[contains(@id, "inventory-variation-select")]')
    for options in product_options_list:
        # Get list of options
        temp_list = options.xpath('.//text()').extract()
        # Remove '\n' strings
        temp_list = list(map(lambda s: s.strip(), temp_list))
        # Remove empty strings ('')
        temp_list = list(filter(lambda s: s != '', temp_list))
        # Filter the 'Quantity' option
        if temp_list[0] != '1':
            # Create the final string,
            # example: "Select a color: White, Black, Red, Silver"
            product_options.append(temp_list[0] + ': ' + ', '.join(temp_list[1:]))
    # Separate each option with a | (pipe) symbol
    l.add_value('product_options', '|'.join(product_options))

    # Get the product rating (ex: 4.8)
    l.add_xpath('rating', '//a[@href="#reviews"]//input[@name="rating"]/@value')

    # Get the number of votes (number of reviews)
    l.add_xpath('number_of_reviews',
                '//button[@id="same-listing-reviews-tab"]/span/text()')

    # Count the number of product images
    images_sel = response.xpath(
        '//ul[@data-carousel-pagination-list=""]/li/img/@data-src-delay').extract()
    l.add_value('count_of_images', len(images_sel))
    l.add_value('images_urls', images_sel)

    # Get the product overview
    #l.add_xpath('overview', '//*[@class="listing-page-overview-component"]//li')

    # Get the number of people that added the product to favorites
    l.add_xpath(
        'favorited_by',
        '//*[@id="item-overview"]//*[contains(@href, "/favoriters")]/text()',
        re=r'(\d+)')
    l.add_xpath('favorited_by',
                '//*[@class="listing-page-favorites-link"]/text()', re=r'(\d+)')
    l.add_xpath('favorited_by',
                '//a[contains(text(), " favorites")]/text()', re=r'(\d+)')

    # Get the name of the store and its location
    l.add_xpath('store_name', '//div[@id="listing-page-cart"]//span/text()')
    #l.add_xpath('store_location', '//*[@id="shop-info"]/div')
    #l.add_xpath('return_location', "//*[@class='js-estimated-delivery']/following-sibling::div")

    # Use the chosen method to get the reviews
    self.logger.info('Reviews scraping option: ' + str(self.reviews_opt))

    # Option 3 - All reviews
    if self.reviews_opt == 3:
        # Getting all reviews
        store_name = response.xpath(
            '//span[@itemprop="title"]//text()').extract_first()
        # Build the reviews URL
        rev_url = "https://www.etsy.com/shop/{}/reviews?ref=l2-see-more-feedback".format(
            store_name)
        data = {'itemLoader': l, 'product_id': product_id}
        # Go to the all-reviews page
        yield Request(rev_url, meta=data, callback=self.parse_reviews)

    # Option 2 - Ajax request
    elif self.reviews_opt == 2:
        # Creating the Ajax request
        # Getting the session cookie
        get_cookie = response.request.headers['Cookie'].split(b';')[0].split(b'=')
        cookies = {get_cookie[0].decode("utf-8"): get_cookie[1].decode("utf-8")}
        # Getting the x-csrf-token
        headers = {
            'x-csrf-token': response.xpath("//*[@name='_nnc']/@value").extract_first()
        }
        # Shop Id
        shop_id = response.xpath(
            "//*[@property='og:image']/@content").extract_first().split('/')[3]
        formdata = {
            'stats_sample_rate': '',
            'specs[reviews][]': 'Listzilla_ApiSpecs_Reviews',
            'specs[reviews][1][listing_id]': product_id,
            'specs[reviews][1][shop_id]': shop_id,
            'specs[reviews][1][render_complete]': 'true'
        }
        data = {'itemLoader': l, 'product_id': product_id}
        ajax_url = "https://www.etsy.com/api/v3/ajax/bespoke/member/neu/specs/reviews"
        yield scrapy.FormRequest(ajax_url, headers=headers, cookies=cookies,
                                 meta=data, formdata=formdata,
                                 callback=self.parse_ajax_response)

    # Option 1
    else:
        # List that collects all the reviews data
        reviews_data = []
        reviews_counter = 1
        # Get the data from each review
        all_reviews = response.xpath(
            '//*[@class="listing-page__review col-group pl-xs-0 pr-xs-0"]')
        # Process each review
        for r in all_reviews:
            # Get the profile URL of the reviewer
            reviewer_profile = r.xpath(
                ".//*[@class='display-block']/parent::*//@href").extract_first()
            if reviewer_profile:
                # Build the full profile url
                reviewer_profile = 'www.etsy.com' + reviewer_profile
            else:
                # If the profile is inactive there is no profile url
                continue
            review_date = r.xpath(
                ".//*[@class='text-link-underline display-inline-block mr-xs-1']/parent::*//text()"
            ).extract()[2].strip()
            reviewer_rating = r.xpath(
                './/input[@name="rating"]/@value').extract_first()
            review_content = " ".join(
                r.xpath('.//div[@class="overflow-hidden"]//text()').extract()).strip()
            # Build the review string
            rev_data = "Review number: {} \nProfile: {} \nRating: {} \nDate: {} \nContent: {}".format(
                reviews_counter, reviewer_profile, reviewer_rating,
                review_date, review_content)
            # Save into the list
            reviews_data.append(rev_data)
            reviews_counter += 1
        # Save all reviews data
        l.add_value('reviews', "\n\n".join(reviews_data))

    # Increment the items counter
    self.COUNTER += 1
    print('\nProducts scraped: {}\n'.format(self.COUNTER))
    yield l.load_item()
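# parse_product() relies on spider-level COUNTER / COUNT_MAX attributes and a
# reviews_opt switch. A minimal sketch of how those might be declared (the class
# name, spider name and defaults are assumptions, not the original project's code):
import scrapy


class EtsyProductSpiderSketch(scrapy.Spider):
    name = "etsy_products_sketch"   # hypothetical
    COUNTER = 0                     # incremented once per scraped product
    COUNT_MAX = 100                 # CloseSpider is raised once this is reached

    def __init__(self, reviews_opt=1, count_max=100, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.reviews_opt = int(reviews_opt)   # 1: on-page, 2: ajax, 3: all reviews
        self.COUNT_MAX = int(count_max)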
def parse_odds_win_place(self, response):
    """ Parse odds win place page.

    @url https://keiba.yahoo.co.jp/odds/tfw/1906050201/?ninki=0
    @returns items 1
    @returns requests 1
    @odds_win_place
    """
    logger.info(f"#parse_odds_win_place: start: url={response.url}")

    race_id = response.url.split("/")[-2]

    # Parse odds win place
    for tr in response.xpath("//table[@class='dataLs oddTkwLs']/tbody/tr"):
        if len(tr.xpath("th")) > 0:
            continue
        loader = ItemLoader(item=OddsWinPlaceItem(), selector=tr)
        loader.add_value("race_id", race_id)
        loader.add_xpath("horse_number", "td[2]/text()")
        loader.add_xpath("horse_id", "td[3]/a/@href")
        loader.add_xpath("odds_win", "td[4]/text()")
        loader.add_xpath("odds_place_min", "td[5]/text()")
        loader.add_xpath("odds_place_max", "td[7]/text()")
        i = loader.load_item()
        logger.debug(f"#parse_odds_win_place: odds_win_place={i}")
        yield i

    # Parse odds bracket quinella
    for tr in response.xpath("//table[@class='oddsLs']/tbody/tr"):
        th = tr.xpath("th")
        if "class" in th.attrib:
            bracket_number_1 = th.xpath("div/text()").get()
        else:
            loader = ItemLoader(item=OddsBracketQuinellaItem(), selector=tr)
            loader.add_value("race_id", race_id)
            loader.add_value("bracket_number_1", bracket_number_1)
            loader.add_value("bracket_number_2", th.xpath("text()").get())
            loader.add_xpath("odds", "td/text()")
            i = loader.load_item()
            logger.debug(f"#parse_odds_win_place: odds_bracket_quinella={i}")
            yield i

    # Parse links
    logger.debug("#parse_odds_win_place: parse link")
    for a in response.xpath("//a"):
        href = a.xpath("@href").get()
        if not href:
            # href can be None in some cases
            continue
        if href.startswith("/odds/ut/") \
                or href.startswith("/odds/ur/") \
                or href.startswith("/odds/wide/") \
                or href.startswith("/odds/st/") \
                or href.startswith("/odds/sf/"):
            yield self._follow_delegate(response, href)
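# The "@odds_win_place" line in the docstring above is a custom Scrapy contract
# tag. A sketch of what such a contract could look like (only the tag name comes
# from the docstring; the class and the check are assumptions):
from scrapy.contracts import Contract
from scrapy.exceptions import ContractFail
from scrapy.item import Item


class OddsWinPlaceContract(Contract):
    """Checks that the callback yields at least one item carrying an odds_win field."""
    name = "odds_win_place"

    def post_process(self, output):
        items = [o for o in output if isinstance(o, Item)]
        if not any("odds_win" in item for item in items):
            raise ContractFail("no item with an odds_win field was scraped")


# settings.py (sketch): enable it alongside the built-in contracts
# SPIDER_CONTRACTS = {"myproject.contracts.OddsWinPlaceContract": 10}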
def parse_contents(self, response):
    item_loader = ItemLoader(item=ReCrawlerItem(), response=response)
    item_loader.add_xpath('title', '//font[@class="headtitle"]/text()')
    item_loader.add_value('url', response.url)
    return item_loader.load_item()
def get_news(self, response):
    try:
        data = response.xpath('//div[@id="textBox"]')
        content = data.xpath('string(.)').extract_first()
        item = response.meta['item']
        # Cut the body off at the "分享到:" ("Share to:") footer
        item['content'] = content[0:content.find(u'分享到:')]
        item['collection_name'] = self.name
        item['website'] = self.website
        yield item
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        yield l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=RentslamItem(), response=response)

    # All data must be extracted using XPath queries
    image_url = response.xpath(
        '//*[@class="carousel-inner"]//@src').extract_first()
    url = response.url
    price = response.xpath(
        './/div[contains(@class,"aanbod-info-price")]/text()').extract_first()
    bedrooms = response.xpath(
        './/p[contains(@class,"aanbod-ifo-rooms")]/text()').extract_first()
    size = response.xpath(
        './/p[contains(@class,"aanbod-ifo-squarefeet")]/text()').extract_first()
    address = response.xpath(
        './/h1[contains(@class,"aanbod-ifo-street")]/text()').extract_first()
    text_list = response.xpath(
        './/div[contains(@class,"wpb_wrapper")]/text()').extract()
    text = (''.join(text_list)).strip()
    # Furnishing ("Oplevering" in Dutch)
    furnishing = response.xpath(
        './/p[contains(@class,"aanbod-ifo-furniture")]/text()').extract_first()

    # Full url (mandatory)
    l.add_value('ImageUrl', image_url)
    # Full url (mandatory)
    l.add_value('Url', url)
    # Price must not include the currency symbol, dot or comma, and decimals
    # must be filtered out. Example: € 1.348,77 --> 1348 (mandatory)
    l.add_value('Price', price, Join(''), re=r'\d+')
    # Number (if present). Bedrooms is "Slaapkamers" in Dutch
    l.add_value('Bedrooms', bedrooms, TakeFirst(), re=r'\d+')
    # Size must include only the number; things like "m2" must be filtered out.
    # Example: 90 m2 --> 90 (if present)
    l.add_value('Size', size, TakeFirst(), re=r'\d+')
    # The address must contain the street name (mandatory) and the house number
    # (if present). It must not contain the city name or the postcode
    l.add_value('Address', address)
    # This is the description of the listing (if present)
    l.add_value('Text', text)
    # You can copy the email address from the website here (if present)
    l.add_value('ContactEmailAddress', '*****@*****.**')
    # You can copy the phone number from the website here (if present)
    l.add_value('ContactPhoneNumber', '+31 20 672 33 31')
    l.add_value('Furnishing', furnishing.replace('Oplevering:', '').strip())
    l.add_value('City', 'Amsterdam')

    yield l.load_item()
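# The Price comment above asks for "€ 1.348,77" to become "1348" (thousands dot
# dropped, decimals cut off), which Join('') plus a digit regex does not quite
# achieve. One way to express that cleanup as an input processor instead; this is
# a sketch, not the project's own code (on older Scrapy the processors live in
# scrapy.loader.processors rather than itemloaders.processors):
from itemloaders.processors import MapCompose, TakeFirst

def clean_dutch_price(value):
    """'€ 1.348,77' -> '1348': strip currency and whitespace, cut decimals, drop thousands dots."""
    value = value.replace("€", "").strip()
    value = value.split(",")[0]          # drop the decimal part
    return value.replace(".", "") or None

price_in = MapCompose(clean_dutch_price)
price_out = TakeFirst()
# e.g. price_in(["€ 1.348,77"]) == ["1348"]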
def parse_horse(self, response):
    """ Parse horse page.

    @url https://keiba.yahoo.co.jp/directory/horse/2017101602/
    @returns items 1 1
    @returns requests 0 0
    @horse
    """
    logger.info(f"#parse_horse: start: url={response.url}")

    horse_id = response.url.split("/")[-2]

    # Parse horse
    loader = ItemLoader(item=HorseItem(), response=response)
    loader.add_value("horse_id", horse_id)
    loader.add_xpath("gender", "string(//div[@id='dirTitName']/p)")
    loader.add_xpath("name", "//div[@id='dirTitName']/h1/text()")
    loader.add_xpath("birthday", "//div[@id='dirTitName']/ul/li[1]/text()")
    loader.add_xpath("coat_color", "//div[@id='dirTitName']/ul/li[2]/text()")
    loader.add_xpath("trainer_id", "//div[@id='dirTitName']/ul/li[3]/a/@href")
    loader.add_xpath("owner", "//div[@id='dirTitName']/ul/li[4]/text()")
    loader.add_xpath("breeder", "//div[@id='dirTitName']/ul/li[5]/text()")
    loader.add_xpath("breeding_farm", "//div[@id='dirTitName']/ul/li[6]/text()")

    tdBloodM = response.xpath(
        "//table[@id='dirUmaBlood']/tr/td[@class='bloodM']/text()")
    loader.add_value("parent_horse_name_male_1", tdBloodM[0].get())
    loader.add_value("parent_horse_name_male_21", tdBloodM[1].get())
    loader.add_value("parent_horse_name_male_31", tdBloodM[2].get())
    loader.add_value("parent_horse_name_male_32", tdBloodM[3].get())
    loader.add_value("parent_horse_name_male_22", tdBloodM[4].get())
    loader.add_value("parent_horse_name_male_33", tdBloodM[5].get())
    loader.add_value("parent_horse_name_male_34", tdBloodM[6].get())

    tdBloodF = response.xpath(
        "//table[@id='dirUmaBlood']/tr/td[@class='bloodF']/text()")
    loader.add_value("parent_horse_name_female_31", tdBloodF[0].get())
    loader.add_value("parent_horse_name_female_21", tdBloodF[1].get())
    loader.add_value("parent_horse_name_female_32", tdBloodF[2].get())
    loader.add_value("parent_horse_name_female_1", tdBloodF[3].get())
    loader.add_value("parent_horse_name_female_33", tdBloodF[4].get())
    loader.add_value("parent_horse_name_female_22", tdBloodF[5].get())
    loader.add_value("parent_horse_name_female_34", tdBloodF[6].get())

    i = loader.load_item()
    logger.debug(f"#parse_horse: horse={i}")
    yield i
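# parse_horse() feeds raw node text and hrefs into the loader, so the per-field
# cleanup presumably lives on the item. A sketch of how a few HorseItem fields
# could declare their processors (field names come from the calls above; the
# processors themselves are assumptions; on older Scrapy import the processors
# from scrapy.loader.processors instead):
import scrapy
from itemloaders.processors import MapCompose, TakeFirst


class HorseItemSketch(scrapy.Item):
    horse_id = scrapy.Field(output_processor=TakeFirst())
    name = scrapy.Field(
        input_processor=MapCompose(str.strip),
        output_processor=TakeFirst(),
    )
    trainer_id = scrapy.Field(
        # e.g. "/directory/trainer/01012/" -> "01012" (illustrative href shape)
        input_processor=MapCompose(lambda href: href.split("/")[-2]),
        output_processor=TakeFirst(),
    )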
def parse_item(self, response):
    logging.info(u"start crawl ---> " + response.url)
    item = ItemLoader(item=NewsItem(), response=response)
    sel = Selector(response)

    content = sel.xpath('//div[@id="Cnt-Main-Article-QQ"]/p')
    article_time = content.xpath('//span[@class="pubTime"]/text()').extract()
    date_time = compare_time(article_time, u"%Y年%m月%d日%H:%M")
    if not date_time:
        return

    item.add_xpath('keywords', "//head/meta[@name='keywords']/@content")
    item.add_value('date_time', date_time)
    item.add_xpath('title', '//div[@class="hd"]/h1/text()')
    item.add_xpath('reading_number', '//em[@id="top_count"]/text()')
    item.add_xpath('author', '//span[@class="auth"]/text()')
    item.add_value('original_link', response.url)

    elements = sel.xpath('//div[@id="Cnt-Main-Article-QQ"]/p').extract()
    images, content = translate_content(elements)
    if images:
        item.add_value('image_url', hashlib.sha1(images[0]).hexdigest() + ".jpg")
    item.add_value('content', content)
    item.add_value('image_urls', images)
    item.add_value('source', u'腾讯科技')
    item.add_value('category', CATEGORY.TECHNOLOGY)

    logging.info(u"finished crawl ---> " + response.url)
    yield item.load_item()
def parse_item(self, response):
    loader = ItemLoader(item=SpiderItem(), response=response)
    date = '1970-01-01'
    content = ''
    try:
        title = response.xpath(
            r'//*[@class="infor_border"]/h1//text()').extract()
        date_raw = response.xpath(
            r'//*[@class="right_sc"]//text()').extract()
        if date_raw is not None:
            date_raw = removern(date_raw)
            date = date_raw.strip().split(" ")[0]
        content = response.xpath(
            r'//*[@class="news_content"]/p//text()').extract()
        loader.add_value('date', date)
        loader.add_value('title', title)
        loader.add_value('content', content)
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('date', date)
        loader.add_value('title', 'unknown')
        loader.add_value('content', '')
    finally:
        self.logger.info("crawled url: %s" % response.url)
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value("website", self.website)
        if content == '':
            self.logger.warning(' url: %s msg: %s' %
                                (response.url, ' content is None'))
        yield loader.load_item()
def parse_item(self, response):
    """ This function parses a property page.

    @url http://web:9312/properties/property_000000.html
    @returns items 1
    @scrapes title price description address image_urls
    @scrapes url project spider server date
    """
    # Create the loader using the response
    l = ItemLoader(item=PropertiesItem(), response=response)

    # Load fields using XPath expressions
    l.add_value('title', response.meta['title'],
                MapCompose(str.strip, str.title))
    l.add_xpath('price', './/*[@itemprop="price"][1]/text()',
                MapCompose(lambda i: i.replace(',', ''), float),
                re='[,.0-9]+')
    l.add_xpath('description', '//*[@itemprop="description"][1]/text()',
                MapCompose(str.strip), Join())
    l.add_xpath('address',
                '//*[@itemtype="http://schema.org/Place"][1]/text()',
                MapCompose(str.strip))
    l.add_xpath('image_urls', '//*[@itemprop="image"][1]/@src',
                MapCompose(lambda i: urlparse.urljoin(response.url, i)))

    # Housekeeping fields
    l.add_value('url', response.url)
    l.add_value('project', self.settings.get('BOT_NAME'))
    l.add_value('spider', self.name)
    l.add_value('server', socket.gethostname())
    l.add_value('date', datetime.datetime.now())

    return l.load_item()
def parse_item(self, response):
    loader = ItemLoader(item=SpiderItem(), response=response)
    content = ''
    try:
        title = response.xpath(r'//*[@class="summi"]/h1//text()').extract()
        date = response.xpath(
            r'//*[@class="summi"]/ul/li[1]//text()').extract()
        if date:
            # keep only the date part of the first text node
            date = date[0].strip().split(" ")[0]
        else:
            date = '1970-01-01'
        content = response.xpath(r'//*[@class="summs"]//text()').extract()
        loader.add_value('date', date)
        loader.add_value('title', title)
        loader.add_value('content', content)
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('date', '1970-01-01')
        loader.add_value('title', 'unknown')
        loader.add_value('content', '')
    finally:
        self.logger.info("crawled url: %s" % response.url)
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value("website", self.website)
        if content == '':
            self.logger.warning(' url: %s msg: %s' %
                                (response.url, ' content is None'))
        yield loader.load_item()
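# The two parse_item() variants above repeat the same placeholder block in their
# except branches. A small helper like this (hypothetical, not part of the
# original spiders) could keep that fallback in one place:
def load_fallback(loader, date='1970-01-01', title='unknown', content=''):
    """Populate a SpiderItem loader with placeholder values after a parse error."""
    loader.add_value('date', date)
    loader.add_value('title', title)
    loader.add_value('content', content)
    return loader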