def parse_sesja(self, response):
    # resolutions ("uchwaly")
    uchwaly_le = LinkExtractor(allow=FindReportsSpider.UCHWALA_RE,
                               restrict_xpaths="//table")
    links = uchwaly_le.extract_links(response)
    self.print_links("uchwaly", links)
    cnt = 0
    for link in links:
        yield scrapy.Request(link.url, callback=self.parse_uchwala)
        k = items.PageItem()
        k["text"] = link.text.encode("utf8")
        k["url"] = link.url
        k["ref"] = response.url
        k["order"] = cnt
        yield k
        if cnt >= DEBUG_CNT and DEBUG:
            break
        cnt += 1

    # files (votes and attendance: "glosowania", "obecnosc")
    le = LinkExtractor(allow=FindReportsSpider.PLIK_RE)
    links = le.extract_links(response)
    self.print_links("glosowania", links)
    cnt = 0
    for link in links:
        fi = items.FiledownloadItem()
        fi["file_urls"] = [link.url]
        fi["text"] = link.text.encode("utf8")
        fi["url"] = link.url
        fi["ref"] = response.url
        fi["order"] = cnt
        yield fi
        if cnt >= DEBUG_CNT and DEBUG:
            break
        cnt += 1
def parse(self, response):
    # Extract the link to each book on the listing page
    le = LinkExtractor(restrict_css='article.product_pod h3')
    for link in le.extract_links(response):
        yield scrapy.Request(link.url, callback=self.parse_book)

    # Extract the link to the next page
    le = LinkExtractor(restrict_css='ul.pager li.next')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        yield scrapy.Request(next_url, callback=self.parse)
class MySpider(scrapy.Spider):
    # Your spider definition
    name = "fetch_data"

    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        self.start_urls = [kwargs.get('start_url')]
        self.link_extractor = LinkExtractor()

    def parse(self, response):
        item = WebpageScraperItem()
        item['key'] = self.start_urls
        item['title'] = response.xpath('//title/text()').extract()
        item['paragraphs'] = response.xpath('//p/text()').extract()
        item['headings'] = response.xpath('//h1/text()').extract()
        links = self.link_extractor.extract_links(response)
        item['links'] = [x.url for x in links]
        img_urls = []
        for img in response.xpath('//img/@src').extract():
            # Force an http scheme on scheme-relative image URLs
            parsed_url = urlparse.urlparse(img)._replace(scheme="http")
            img_urls.append(parsed_url.geturl())
        item['image_urls'] = img_urls
        return item
def parse_state(self, response):
    """Yields a scrapy.Request object for each city with a store in the state"""
    state_url = 'stores.joann.com/{}*'.format(response.meta['state'])
    extractor = LinkExtractor(allow=state_url)
    for link in extractor.extract_links(response):
        yield scrapy.Request(link.url, callback=self.parse_city, headers=HEADERS)
class BCSpider(Spider):
    name = 'bc'

    def __init__(self, *args, **kwargs):
        super(BCSpider, self).__init__(*args, **kwargs)
        self.le = LinkExtractor()

    def parse(self, response):
        if not isinstance(response, HtmlResponse):
            return
        for link in self.le.extract_links(response):
            r = Request(url=link.url)
            r.meta.update(link_text=link.text)
            yield r

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(BCSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider._set_crawler(crawler)
        spider.crawler.signals.connect(spider.spider_idle,
                                       signal=signals.spider_idle)
        return spider

    def spider_idle(self):
        self.log("Spider idle signal caught.")
        raise DontCloseSpider
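For context, the idle hook above relies on names from Scrapy itself. A minimal import block that would make this spider self-contained (an assumption, since the snippet's original imports are not shown):

from scrapy import Spider, Request, signals
from scrapy.exceptions import DontCloseSpider
from scrapy.http import HtmlResponse

Raising DontCloseSpider from a spider_idle handler is Scrapy's supported way to keep the crawl alive after the scheduler runs dry, e.g. when requests are fed in from an external queue.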
def parse(self, response):
    le = LinkExtractor()
    for link in le.extract_links(response):
        yield scrapy.Request(link.url, self.parse_link, meta={
            'splash': {
                'args': {'har': 1, 'html': 0},
            }
        })
def parse_link(self, response):
    # log
    self.logger.info('Hi, this is an item page! %s', response.url)
    # parse links
    linkExtractor = LinkExtractor(allow=r".+\.shtml",
                                  restrict_css='div.list > ul', unique=True)
    links = linkExtractor.extract_links(response)
    for link in links:
        yield scrapy.Request(link.url, callback=self.parse_content)
def parse(self, response):
    extractor = LinkExtractor(allow="/article/*")
    links = extractor.extract_links(response)
    for link in links:
        item = XiubaiItem()
        req = Request(link.url, self.parse_detail_page)
        req.meta['item'] = item
        yield req
def parse(self, response):
    # ❶ Print the URL of the fetched page
    print(response.url)
    # ❷ Create a LinkExtractor object for extracting links within the page
    le = LinkExtractor()
    # ❸ Extract the links in the page
    for link in le.extract_links(response):
        # ❹ Build and yield a Request object for each extracted link
        yield response.follow(link.url, self.parse)
def parse(self, response):
    le_area = LinkExtractor(
        allow=r'/[a-zA-Z0-9]+/$',
        restrict_xpaths="//div[@data-role='ershoufang']/div[1]")
    links = le_area.extract_links(response)
    # Crawl the per-district pages
    for link in links:
        yield scrapy.Request(link.url, callback=self.parse_node)
def parse(self, response):
    keyword = self._parse_keyword(response.url)
    le = LinkExtractor(allow=r'http://detail\.zol\.com\.cn/cell_phone/.*')
    for link in le.extract_links(response):
        item = BaiduSearchItem()
        item['keyword'] = keyword
        item['link'] = link.url
        item['name'] = link.text
        yield item
def parse(self, response):
    # Extract links to list pages
    pattern = r"https://list\.jd\.com/list\.html\?cat=.*"
    le = LinkExtractor(allow=pattern)
    links = le.extract_links(response)
    print("Found %s list pages" % len(links))
    for i in links:
        print("-------------------->%s" % i.url)
        yield scrapy.Request(i.url, callback=self.next_page)
def parse(self, response):
    link = LinkExtractor(
        allow=r'http://www\.taitung\.gov\.tw/opendata/OD_OpenData_DealData\.aspx\?s=\w+')
    links = link.extract_links(response)
    for lin in links:
        i = {}
        i["link"] = lin.url
        i["title"] = lin.text
        # Fetch the page synchronously; avoid shadowing the Scrapy response
        data_response = requests.get(i["link"])
        self.getDataByRequest(data_response, i)
def parse_region(self, response):
    print('parse_region response.url:' + response.url)
    self.logger.debug('parse_region response.url:' + response.url)
    yield Request(response.url, callback=self.parse_list)
    le = LinkExtractor(restrict_css='div.item-list.area-bd > div.filter-sub')
    print('2' * 40)
    for link in le.extract_links(response):
        print(link, link.url, link.text)
        yield Request(link.url, callback=self.parse_price)
def parse(self, response):
    le = LinkExtractor(restrict_css="div.toctree-wrapper.compound",
                       deny='/index.html$')
    # print(len(le.extract_links(response)))
    for link in le.extract_links(response):
        yield scrapy.Request(
            link.url,
            callback=self.parse_detail,
        )
def parse(self, response):
    print('parse response.url:' + response.url)
    self.logger.debug('parse response.url:' + response.url)
    yield Request(response.url, callback=self.parse_list)
    le = LinkExtractor(restrict_css='.search-area-detail')
    print('1' * 20)
    for link in le.extract_links(response):
        print(link, link.url, link.text)
        yield Request(link.url, callback=self.parse_region)
def parse(self, response):
    e = LinkExtractor()
    urls = [link.url for link in e.extract_links(response)]
    for url in urls:
        parsed = urlparse.urlsplit(url)
        qs = urlparse.parse_qs(parsed.query)
        if qs and 'Url' in qs:
            event_url = qs['Url'][0]
            yield self.add_url(event_url)
def parse(self, response):
    # le = LinkExtractor(restrict_css='div.toctree-wrapper.compound li.toctree-l2')
    le = LinkExtractor(
        restrict_css='div.toctree-wrapper.compound li.toctree-l1',
        deny='/index.html$')
    # pdb.set_trace()
    for link in le.extract_links(response):
        # print link.url
        yield scrapy.Request(link.url, callback=self.parse_url)
def parse(self, response):
    story_link_regex = r'http://prntly\.com/[0-9]{,4}/[0-9]{,2}/[0-9]{,2}/[a-z\-]+/'
    page_link_regex = r'http://prntly\.com/page/[0-9]+/'
    story_link_extractor = LinkExtractor(canonicalize=True, unique=True,
                                         allow=story_link_regex)
    story_links = story_link_extractor.extract_links(response)
    with open('prntly.com.txt', 'a') as f:
        for link in story_links:
            f.write(link.url + '\n')
    page_link_extractor = LinkExtractor(canonicalize=True, unique=True,
                                        allow=page_link_regex)
    page_links = page_link_extractor.extract_links(response)
    for link in page_links:
        yield scrapy.Request(url=link.url, callback=self.parse)
def parse_region(self, response):
    print('parse_region response.url:' + response.url)
    self.logger.debug('parse_region response.url:' + response.url)
    yield Request(response.url, callback=self.parse_list)
    le = LinkExtractor(restrict_css='#region-nav-sub')
    print('2' * 100)
    for link in le.extract_links(response):
        print(link, link.url, link.text)
        yield Request(link.url, callback=self.parse_classfy)
def parse(self, response):
    le = LinkExtractor(restrict_xpaths='/html/body/div[2]/div[4]/ul/li/a')
    for link in le.extract_links(response):
        print(link)
        yield scrapy.Request(link.url, callback=self.parsemore, dont_filter=True)
def parse_hall(self, response):
    print('parse_hall response.url:' + response.url)
    self.logger.debug('parse_hall response.url:' + response.url)
    yield Request(response.url, callback=self.parse_list)
    le = LinkExtractor(restrict_css='div.filter-mod > div:nth-child(3) > div')
    print('4' * 160)
    for link in le.extract_links(response):
        print(link, link.url, link.text)
        yield Request(link.url, callback=self.parse_list)
def parse(self, response):
    if response.status != 200 or not response.body:
        return

    ads_links = response.xpath("//a[img]")
    for ads_link in ads_links:
        link_href = ads_link.xpath("@href").extract_first()
        if self._from_same_site(response.url, link_href):
            continue
        ads_profile = AdsProfileItem()
        ads_profile["ads_host"] = response.url
        ads_profile["ads_present_mode"] = "normal_1"
        ads_profile["ads_target_url"] = link_href
        img_src = response.urljoin(ads_link.xpath("img/@src").extract_first())
        ads_profile["ads_content_url"] = img_src
        ads_profile["ads_content_frame"] = ""
        ads_profile["ads_host_domain"] = urlparse(response.url).netloc
        ads_profile["ads_target_domain"] = urlparse(link_href).netloc
        yield ads_profile

    if isinstance(response, SplashJsonResponse):
        if "childFrames" in response.data:
            frames = self._get_all_child_frames(response)
            print("Get %s childFrames in %s" % (len(frames), response.url))
            for frame_response in frames:
                if not self._is_valid_frame(frame_response.url):
                    continue
                ads_links = frame_response.xpath("//a[img]")
                for ads_link in ads_links:
                    link_href = ads_link.xpath("@href").extract_first()
                    if self._from_same_site(response.url, link_href):
                        continue
                    ads_profile = AdsProfileItem()
                    ads_profile["ads_host"] = response.url
                    ads_profile["ads_present_mode"] = "normal_1"
                    ads_profile["ads_target_url"] = link_href
                    img_src = frame_response.urljoin(
                        ads_link.xpath("img/@src").extract_first())
                    ads_profile["ads_content_url"] = img_src
                    ads_profile["ads_content_frame"] = frame_response.url
                    ads_profile["ads_host_domain"] = urlparse(response.url).netloc
                    ads_profile["ads_target_domain"] = urlparse(link_href).netloc
                    yield ads_profile

    link_extractor = LinkExtractor()
    all_links = link_extractor.extract_links(response)
    for link in all_links:
        request = SplashRequest(
            response.urljoin(link.url),
            self.parse,
            endpoint="render.json",
            slot_policy=SlotPolicy.PER_DOMAIN,
            args={"html": 1, "iframes": 1},
        )
        request.headers.setdefault("User-Agent",
                                   self.ua_generater.get_user_agent())
        yield request
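The helpers _from_same_site, _is_valid_frame, and _get_all_child_frames are not shown in the snippet. A minimal sketch of what _from_same_site might look like, assuming it simply compares the two URLs' netlocs (purely illustrative, not the original implementation):

from urllib.parse import urlparse

def _from_same_site(self, page_url, link_url):
    # Treat missing hrefs as same-site so the caller skips them
    if not link_url:
        return True
    return urlparse(page_url).netloc == urlparse(link_url).netloc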
def parse(self, response):
    for le in response.css('.content'):
        url = le.xpath('./h2/a/@href').extract_first()
        yield scrapy.Request(url, callback=self.parse_detail)

    le = LinkExtractor(restrict_css='.current+a')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        yield scrapy.Request(next_url, callback=self.parse)
def parse_classfy(self, response):
    print('parse_classfy response.url:' + response.url)
    self.logger.debug('parse_classfy response.url:' + response.url)
    yield Request(response.url, callback=self.parse_list)
    le = LinkExtractor(restrict_css='#classfy')
    print('3' * 150)
    for link in le.extract_links(response):
        print(link, link.url, link.text)
        yield Request(link.url, callback=self.parse_list)
def parse_nvyou_info(self, response):
    le = LinkExtractor(restrict_xpaths='//*[@id="waterfall"]')
    links = le.extract_links(response)
    for link in links:
        yield scrapy.Request(url=link.url, callback=self.parse_info)
    nextpage = response.xpath('//*[@id="next"]/@href').extract_first()
    if nextpage:
        nextpage = self.web + nextpage
        yield scrapy.Request(url=nextpage, callback=self.parse_nvyou_info)
def parse(self, response):
    '''
    Parse the detail link of each book on the page, plus the link to the next page.
    :param response:
    :return:
    '''
    # Get the detail link of each book
    le = LinkExtractor(restrict_css='article.product_pod h3')
    links = le.extract_links(response)
    if links:
        for link in links:
            yield scrapy.Request(url=link.url, callback=self.parse_book)

    # Get the next-page link; there is only one
    le = LinkExtractor(restrict_xpaths='//li[@class="next"]/a')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        yield scrapy.Request(url=next_url, callback=self.parse)
def parse_list(self, response):
    print('parse_list response.url:' + response.url)
    self.logger.debug('parse_list response.url:' + response.url)
    item = SportsItem()
    li = response.css('#shop-all-list>ul>li')
    print('parse_list li:{} response.url: {}'.format(
        li.css('.txt>.tit h4::text').extract(), response.url))
    self.logger.debug('parse_list li:{} response.url: {}'.format(
        li.css('.txt>.tit h4::text').extract(), response.url))
    for i in li:
        item['title'] = i.css('.txt>.tit h4::text').extract_first().strip()
        item['url'] = i.css('.txt>.tit>a::attr(href)').extract_first()
        if i.css('.shop-branch::text'):
            item['branch'] = i.css('.shop-branch::attr(href)').extract_first()
        item['img'] = i.css('img::attr(data-src)').extract_first()
        item['star'] = float(
            i.css('.sml-rank-stars::attr(class)').re_first(r'[1-9]\d*|0')) / 10
        if i.css('.review-num b::text'):
            print('review-num : {}'.format(
                i.css('.review-num>b::text').extract_first()))
            item['review_num'] = int(
                i.css('.review-num>b::text').extract_first())
        if i.css('.mean-price b::text'):
            item['mean_price'] = int(
                i.css('.mean-price b::text').extract_first().strip('¥'))
        print('1111111 score environment service: {}'.format(
            i.css('.comment-list b::text').extract()))
        if i.css('.comment-list b::text').extract():
            print('222222 score environment service: {}'.format(
                i.css('.comment-list b::text').extract()))
            item['score'] = float(i.css('.comment-list b::text').extract()[0])
            item['environment'] = float(
                i.css('.comment-list b::text').extract()[1])
            item['service'] = float(i.css('.comment-list b::text').extract()[2])
        print('type location 1: {}'.format(
            i.css('.tag-addr span::text').extract()))
        item['type'] = i.css('.tag-addr span::text').extract()[0].strip()
        item['location'] = i.css('.tag-addr span::text').extract()[1].strip()
        item['address'] = i.css('.addr::text').extract_first().strip()
        getlocation(item)
        item['number'] = item['url'].split('/')[-1]
        yield item

    le = LinkExtractor(restrict_css='div.page > a.next')
    print('4' * 200)
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        print('next_url:', next_url)
        self.logger.debug('next_url:' + next_url)
        yield Request(next_url, callback=self.parse_list)
class LinkSpider(scrapy.Spider):
    name = 'links'

    def __init__(self, *args, **kwargs):
        self.start_url = kwargs['start_urls'][0]
        self.data_paths = get_data_paths(self.start_url)
        self.main_domain = [self.data_paths['domain']]
        self.start_urls = [self.data_paths['base_url']]
        kwargs['start_urls'] = self.start_urls
        self.max_to_scrap = int(kwargs.get('max_to_scrap', 10))
        print(f'Start url: {self.start_urls[0]}, Domain: {self.main_domain}, '
              f'Max to scrape: {self.max_to_scrap}')
        self.num_scraped = 0
        self.link_extractor = LinkExtractor(allow_domains=self.main_domain)
        super(LinkSpider, self).__init__(*args, **kwargs)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(LinkSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.item_scraped, signal=signals.item_scraped)
        return spider

    def item_scraped(self, item):
        self.num_scraped = self.num_scraped + 1

    def parse(self, response):
        if self.num_scraped >= self.max_to_scrap:
            # Stop yielding requests and items to end the crawl
            return
        depth = response.meta['depth']
        print(f'current url: {response.url}, depth: {depth}')
        headers = response.headers
        if 'text/html' in str(headers.get('Content-Type', '')):
            # The link extractor skips most binary extensions by default,
            # but sometimes the extension is not part of the URL. Everything
            # sits under this check for now; a better approach would be a
            # downloader middleware with more logic to skip unwanted content
            # (a sketch follows this snippet).
            extracted_links = self.link_extractor.extract_links(response)
            for link in extracted_links:
                item = LinkItem()
                item['link'] = link.url
                item['text'] = re.sub(r'\s+', ' ', link.text)
                item['depth'] = depth
                yield item
                # More conditions could be added here to discard URLs
                # that should not be explored further.
                yield scrapy.Request(response.urljoin(link.url),
                                     callback=self.parse)
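The comment about a downloader middleware points at a cleaner design: filter non-HTML responses once, centrally, instead of inside every callback. A hedged sketch of such a middleware (the class name is hypothetical, not part of the original project):

from scrapy.exceptions import IgnoreRequest

class HtmlOnlyMiddleware:
    # Drop non-HTML responses before they reach spider callbacks
    def process_response(self, request, response, spider):
        if b'text/html' not in response.headers.get('Content-Type', b''):
            raise IgnoreRequest('non-HTML content: %s' % response.url)
        return response

It would be enabled through the DOWNLOADER_MIDDLEWARES setting.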
def parse_region(self, response):
    print('parse_region response.url:' + response.url)
    self.logger.debug('parse_region response.url:' + response.url)
    yield Request(response.url, callback=self.parse_list)
    le = LinkExtractor(restrict_css='#qySelectSecond')
    print('2' * 40)
    for link in le.extract_links(response):
        print(link, link.url, link.text)
        yield Request(link.url, callback=self.parse_list)
def parse_region(self, response):
    print('parse_region response.url:' + response.url)
    self.logger.debug('parse_region response.url:' + response.url)
    yield Request(response.url, callback=self.parse_list_first)
    le = LinkExtractor(restrict_css='div[data-role="ershoufang"] > div:nth-child(2)')
    print('2' * 40)
    for link in le.extract_links(response):
        print(link, link.url, link.text)
        yield Request(link.url, callback=self.parse_list_first)
def parse(self, response):
    print('parse response.url:' + response.url)
    self.logger.debug('parse response.url:' + response.url)
    yield Request(response.url, callback=self.parse_list)
    le = LinkExtractor(restrict_css='.sub-filter-wrapper')
    print('1' * 50)
    for link in le.extract_links(response):
        print(link, link.url, link.text)
        yield Request(link.url, callback=self.parse_list_first)
class OffersPlusSpider(scrapy.Spider):
    name = 'offers-plus'
    allowed_domains = ['offers-plus.com']
    start_urls = ['http://offers-plus.com/']
    url = 'http://www.offers-plus.com/categories.php?category=Clothing-%2C-Shoes-%26-Apparel&page={}&sort=newest'
    headline_xpath = '//*[@id="ProductDetails"]/div/h2/text()'
    img_selector = '[class^="ProductThumb"]'
    img_xpath = 'a/img/@src'
    date_grid_selector = '[class^="ProductDetailsGrid"]'
    date_xpath = 'dd[5]/text()'
    domain = 'offers-plus.com'
    MAX_ENTRIES = settings.MAX_ENTRIES

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.extractor = LinkExtractor(
            allow=r'.*/products\.php\?product=.*',
            restrict_xpaths=['//*[@id="frmCompare"]/ul'],
            unique=True)

    def start_requests(self):
        meta = {'index': 1, 'count': 0}
        yield Request(self.url.format(meta['index']),
                      callback=self.parse_outer, meta=meta)

    def parse_outer(self, response):
        count = response.meta['count']
        entries_links = self.extractor.extract_links(response)
        limit = self.MAX_ENTRIES - count
        limit = limit if limit > 0 else 0
        for entry in entries_links[:limit]:
            yield Request(entry.url, callback=self.parse_coupon)
        count += len(entries_links)
        if len(entries_links) == 0 or count > self.MAX_ENTRIES:
            return
        meta = {'index': response.meta['index'] + 1, 'count': count}
        yield Request(self.url.format(meta['index']),
                      callback=self.parse_outer, meta=meta)

    def parse_coupon(self, response):
        item = {'domain': self.domain, 'url': response.url}
        item['headline'] = response.xpath(self.headline_xpath).extract()[0]
        item['date'] = self.get_date(response)
        item['img'] = response.css(self.img_selector).xpath(
            self.img_xpath).extract()[0]
        yield item

    def get_date(self, response):
        date_grid = response.css(self.date_grid_selector)
        date_text = date_grid.xpath(self.date_xpath).extract()[0]
        try:
            return parse(date_text).date()
        except TypeError:
            return None
def parse_region(self, response):
    print('parse_region response.url:' + response.url)
    self.logger.debug('parse_region response.url:' + response.url)
    yield Request(response.url, callback=self.parse_list)
    le = LinkExtractor(
        restrict_css='.items-mod > div:nth-child(1) > div > div.sub-items')
    print('2' * 40)
    for link in le.extract_links(response):
        print(link, link.url, link.text)
        yield Request(link.url, callback=self.parse_list)
def parse(self, response):
    le = LinkExtractor()
    user_profiles = []
    for link in le.extract_links(response):
        result = re.search(r'.*(http://www.last.fm/user/.*)', link.url)
        if result:
            user_profiles.append(result.group(1))
    for user_profile in user_profiles:
        print(user_profile)
def parse(self, response):
    print('parse response.url:' + response.url)
    self.logger.debug('parse response.url:' + response.url)
    yield Request(response.url, callback=self.parse_list)
    le = LinkExtractor(restrict_css='div.screen_al > ul > li:nth-child(1) > ul')
    print('1' * 20)
    for link in le.extract_links(response):
        print(link, link.url, link.text)
        self.logger.debug(link)
        yield Request(link.url, callback=self.parse_region)
def parse_region(self, response):
    print('parse_region response.url:' + response.url)
    self.logger.debug('parse_region response.url:' + response.url)
    yield Request(response.url, callback=self.parse_list_decoration)
    le = LinkExtractor(
        restrict_css='#J_shopsearch > div:nth-child(2) > div > ul')
    print('2' * 100)
    for link in le.extract_links(response):
        print(link, link.url, link.text)
        yield Request(link.url, callback=self.parse_classfy)
def parse(self, response):
    item = PageItem()
    extractor = LinkExtractor(allow_domains='davidwatson.org')
    links = extractor.extract_links(response)
    item['url'] = response.url
    item['html'] = response.body
    item['links'] = [link.url for link in links]
    for link in links:
        yield scrapy.Request(link.url, callback=self.parse)
    yield item
def parse_second(self, response):
    print('2' * 20)
    first_category = response.meta.get('first_category')
    print('parse_second response.url:' + response.url)
    self.logger.debug('parse_second response.url:' + response.url)
    # dd_name values: '分类' = category, '价格' = price, '折扣' = discount
    if response.xpath('//*[@id="navigation"]/ul/li[1]/div[1]/text()'
                      ).extract_first() == '分类':
        le = LinkExtractor(
            restrict_xpaths='//*[@id="navigation"]/ul/li[@dd_name="分类"]/div[2]/div[1]/div')
        for link in le.extract_links(response):
            print(link.url, link.text)
            self.logger.debug('second_category {},{}'.format(
                link.url, link.text))
            second_category = link.text
            yield Request(link.url,
                          callback=self.parse_third,
                          meta={
                              'first_category': first_category,
                              'second_category': second_category
                          })
    elif '价格' in response.xpath(
            '//*[@id="navigation"]/ul/li/@dd_name').extract():
        le = LinkExtractor(
            restrict_xpaths='//*[@id="navigation"]/ul/li[@dd_name="价格"]/div[2]/div[1]/div')
        for link in le.extract_links(response):
            print(link.url, link.text)
            yield Request(link.url,
                          callback=self.parse_books,
                          meta={'first_category': first_category})
    else:
        le = LinkExtractor(
            restrict_xpaths='//*[@id="navigation"]/ul/li[@dd_name="折扣"]/div[2]/div[1]/div')
        for link in le.extract_links(response):
            print(link.url, link.text)
            yield Request(link.url,
                          callback=self.parse_books,
                          meta={'first_category': first_category})
def parse_hall(self, response):
    print('parse_hall response.url:' + response.url)
    self.logger.debug('parse_hall response.url:' + response.url)
    yield Request(response.url, callback=self.parse_list)
    le = LinkExtractor(restrict_css='#list_D02_12 > ul')
    print('3' * 80)
    for link in le.extract_links(response):
        print(link, link.url, link.text)
        self.logger.debug(link)
        yield Request(link.url, callback=self.parse_list)
def parse_code(self, response):
    # Extract the URL of the source code file
    # le = LinkExtractor(restrict_css='div.bodywrapper p', allow='matplotlib.org/examples')
    # link = le.extract_links(response)
    le = LinkExtractor(restrict_css='a.reference.external')
    link = le.extract_links(response)
    file = FilesItem()
    file['file_urls'] = [link[0].url]
    return file
def parse(self, response):
    name = 'example'
    lx = LinkExtractor()
    lst = lx.extract_links(response)  # The list of job links
    # Compare lst against MongoDB; returns a boolean
    flag = compare(name, lst)
    # If True, send an email notification to users
    if flag:
        notify(name)
    else:
        print("No Update")
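compare and notify are external helpers that the snippet assumes. A hypothetical sketch of compare, assuming it diffs the extracted links against a MongoDB collection via pymongo (the database and collection layout are invented for illustration):

from pymongo import MongoClient

def compare(name, links):
    # Return True if any extracted link is not yet stored for this spider
    collection = MongoClient('localhost', 27017)['jobs_db'][name]  # hypothetical layout
    new_found = False
    for link in links:
        if collection.find_one({'url': link.url}) is None:
            collection.insert_one({'url': link.url, 'text': link.text})
            new_found = True
    return new_found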
def parse(self, response):
    link_extractor = LinkExtractor()
    links = link_extractor.extract_links(response)
    for link in links:
        item = DomainItem()
        item['link'] = link.url
        item['domain'] = self.getHost(link.url)
        yield item
    for link in links:
        if not db.scrapy_items.find_one({'link': link.url}):
            yield scrapy.Request(link.url, callback=self.parse)
def parse(self, response):
    le = LinkExtractor()
    for link in le.extract_links(response):
        yield SplashRequest(
            link.url,
            self.parse_link,
            endpoint='render.json',
            args={
                'har': 1,
                'html': 1,
            }
        )
def parse(self, response):
    e = LinkExtractor()
    urls = [link.url for link in e.extract_links(response)]
    for url in urls:
        if response.url != url:
            yield self.add_url(url)
    if urls:
        qs = urlparse.parse_qs(urlparse.urlparse(response.url).query)
        qs = dict((k, v[0]) for (k, v) in qs.iteritems())
        qs['p'] = int(qs['p']) + 1
        url = 'http://comeon5678.com/event/list'
        yield scrapy.Request('%s?%s' % (url, urllib.urlencode(qs)))
class GeneralSpider(Spider):
    name = "general"

    def __init__(self, *args, **kwargs):
        super(GeneralSpider, self).__init__(*args, **kwargs)
        self.le = LinkExtractor()

    def parse(self, response):
        if not isinstance(response, HtmlResponse):
            return
        for link in self.le.extract_links(response):
            r = Request(url=link.url)
            r.meta.update(link_text=link.text)
            yield r
class FundaSpider(CrawlSpider):
    name = "funda_spider"
    allowed_domains = ["funda.nl"]

    def __init__(self, place='amsterdam'):
        self.start_urls = ["http://www.funda.nl/koop/%s/p%s/" % (place, page_number)
                           for page_number in range(1, 301)]
        self.base_url = "http://www.funda.nl/koop/%s/" % place
        self.le1 = LinkExtractor(
            allow=r'%s+(huis|appartement)-\d{8}' % self.base_url)

    def parse(self, response):
        links = self.le1.extract_links(response)
        for link in links:
            if link.url.count('/') == 6 and link.url.endswith('/'):
                item = FundaItem()
                item['url'] = link.url
                if re.search(r'/appartement-', link.url):
                    item['property_type'] = "apartment"
                elif re.search(r'/huis-', link.url):
                    item['property_type'] = "house"
                yield scrapy.Request(link.url,
                                     callback=self.parse_dir_contents,
                                     meta={'item': item})

    def parse_dir_contents(self, response):
        new_item = response.request.meta['item']
        title = response.xpath('//title/text()').extract()[0]
        postal_code = re.search(r'\d{4} [A-Z]{2}', title).group(0)
        city = re.search(r'\d{4} [A-Z]{2} \w+', title).group(0).split()[2]
        address = re.findall(r'te koop: (.*) \d{4}', title)[0]
        price_dd = response.xpath(
            "//dt[contains(.,'Vraagprijs')]/following-sibling::dd[1]/text()"
        ).extract()[0]
        price = re.findall(r' \d+.\d+', price_dd)[0].strip().replace('.', '')
        year_built_dd = response.xpath(
            "//dt[contains(.,'Bouwjaar')]/following-sibling::dd[1]/text()"
        ).extract()[0]
        year_built = re.findall(r'\d+', year_built_dd)[0]
        area_dd = response.xpath(
            "//dt[contains(.,'Woonoppervlakte')]/following-sibling::dd[1]/text()"
        ).extract()[0]
        area = re.findall(r'\d+', area_dd)[0]
        rooms_dd = response.xpath(
            "//dt[contains(.,'Aantal kamers')]/following-sibling::dd[1]/text()"
        ).extract()[0]
        rooms = re.findall(r'\d+ kamer', rooms_dd)[0].replace(' kamer', '')
        bedrooms = re.findall(r'\d+ slaapkamer', rooms_dd)[0].replace(' slaapkamer', '')
        new_item['postal_code'] = postal_code
        new_item['address'] = address
        new_item['price'] = price
        new_item['year_built'] = year_built
        new_item['area'] = area
        new_item['rooms'] = rooms
        new_item['bedrooms'] = bedrooms
        new_item['city'] = city
        yield new_item
def parse_main(self, response):
    le = LinkExtractor(allow=KADENCJA_RE)
    links = le.extract_links(response)
    self.print_links("kadencje", links)
    cnt = 0
    for link in links:
        yield scrapy.Request(link.url, callback=self.parse_kadencja)
        k = items.PageItem()
        k["text"] = link.text.encode("utf8")
        k["url"] = link.url
        k["ref"] = response.url
        k["order"] = cnt
        yield k
        if cnt >= DEBUG_CNT and DEBUG:
            break
        cnt += 1
def print_url(self, response):
    """
    @url http://www.ura.org.hk/en/schemes-and-policies/redevelopment/ura-implemented-projects/reimbursement.aspx
    @returns items 1 1
    @returns requests 0 0
    @scrapes title link html text last_updated file_urls
    """
    l = ItemLoader(item=UrbanRenewalItem(), response=response)
    l.add_xpath('title', '//title')
    l.add_value('link', response.url)
    l.add_xpath('text', '//div[@id="content"]')
    l.add_xpath('html', '/html')
    l.add_xpath('last_updated', '//div[@class="lastUpdated"]')
    lx = LinkExtractor(allow=[r'\.' + ext for ext in file_extension],
                       deny_extensions=())
    l.add_value('file_urls', [link.url for link in lx.extract_links(response)])
    return l.load_item()
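The docstring above uses Scrapy's spider-contract annotations (@url, @returns, @scrapes); contracts like these are exercised with the built-in `scrapy check` command rather than a normal crawl, which is why the method both loads an item and documents its expected output.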
def parse(self, response):
    for sel in response.css('article.product_pod'):
        book = BookstoresItem()
        book['name'] = sel.xpath('./h3/a/@title').extract_first()
        book['price'] = sel.css('p.price_color::text').extract_first()
        yield book

    # Extract the next-page link
    # next_url = response.css('ul.pager li.next a::attr(href)').extract_first()
    # if next_url:
    #     next_url = response.urljoin(next_url)
    #     yield scrapy.Request(next_url, callback=self.parse)
    le = LinkExtractor(restrict_css='ul.pager li.next')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        yield scrapy.Request(next_url, callback=self.parse)
def parse_uchwala(self, response):
    # generate the list of files to download
    le = LinkExtractor(allow=FindReportsSpider.PLIK_RE)
    links = le.extract_links(response)
    self.print_links("files", links)
    cnt = 0
    for link in links:
        fi = items.FiledownloadItem()
        fi["file_urls"] = [link.url]
        fi["text"] = link.text.encode("utf8")
        fi["url"] = link.url
        fi["ref"] = response.url
        fi["order"] = cnt
        yield fi
        if cnt >= DEBUG_CNT and DEBUG:
            break
        cnt += 1
def parse(self, response):
    print(response.url)
    # Extract internal links from the page; also ignore archive files
    IGNORED_EXTENSIONS.append('gz')
    IGNORED_EXTENSIONS.append('tar')
    urlextract = LinkExtractor(allow_domains=self.allowed_domains)
    # Store the internal links
    links = [l.url for l in urlextract.extract_links(response)]
    if response.url not in self.data:
        self.data[response.url] = links
    # Follow the internal links
    for url in links:
        yield scrapy.Request(url, self.parse)
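IGNORED_EXTENSIONS here is Scrapy's module-level default list of file extensions that LinkExtractor filters out, which the snippet presumably imports as below. Note that the mutation must happen before the LinkExtractor is constructed, because the extractor copies the list into its deny set at construction time:

from scrapy.linkextractors import IGNORED_EXTENSIONS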
def parse_kadencja(self, response):
    # e.g. 'LIX Sesja Rady Miasta 24 września 2014 r.'
    # 'http://www.bip.olsztyn.eu/bip/dokument/305103/lix_sesja_rady_miasta_24_wrzesnia_2014_r_/'
    le = LinkExtractor(allow=FindReportsSpider.SESJA_RE)
    links = le.extract_links(response)
    self.print_links("sesje", links)
    cnt = 0
    for link in links:
        yield scrapy.Request(link.url, callback=self.parse_sesja)
        k = items.PageItem()
        k["text"] = link.text.encode("utf8")
        k["url"] = link.url
        k["ref"] = response.url
        k["order"] = cnt
        yield k
        if cnt >= DEBUG_CNT and DEBUG:
            break
        cnt += 1
def parse_item(self, response):
    self.write_response(response.url, response)
    print("----------------------------------",
          response.real_url, response.url)
    le = LinkExtractor()
    for link in le.extract_links(response):
        splashRequestObj = SplashRequest(
            link.url,
            self.parse_item,
            endpoint='render.html',
            args={
                'wait': 0.8,
                'html': 1,
            }
        )
        yield splashRequestObj
def parse(self, response):
    self.write_response(response.url, response)
    url = response.url.lower()
    # Skip documentation and registration pages
    if ("cisco.com/en/us/docs" in url
            or "cisco.com/c/en/us/td/docs" in url
            or "register" in url):
        return
    le = LinkExtractor()
    for link in le.extract_links(response):
        splashRequestObj = SplashRequest(
            link.url,
            self.parse,
            endpoint='render.html',
            args={
                'wait': 0.8,
                'html': 1,
            }
        )
        yield splashRequestObj
class NumberOfPagesSpider(CrawlSpider):
    name = "number_of_pages"
    allowed_domains = ["funda.nl"]

    def __init__(self, place='amsterdam'):
        self.start_urls = ["http://www.funda.nl/koop/%s/" % place]
        self.le_maxpage = LinkExtractor(allow=r'%s+p\d+' % self.start_urls[0])
        rules = (Rule(self.le_maxpage,),)

    def parse(self, response):
        links = self.le_maxpage.extract_links(response)
        max_page_number = 0  # Initialize the maximum page number
        for link in links:
            # Select only pages with a link depth of 3
            if link.url.count('/') == 6 and link.url.endswith('/'):
                # e.g. get 10 out of 'http://www.funda.nl/koop/amsterdam/p10/'
                page_number = int(link.url.split("/")[-2].strip('p'))
                if page_number > max_page_number:
                    # Keep the largest page number seen so far
                    max_page_number = page_number
        filename = "max_pages.txt"
        with open(filename, 'w') as f:
            f.write('max_page_number = %s' % max_page_number)
class GeneralSpider(Spider):
    name = 'general'

    def __init__(self, *args, **kwargs):
        super(GeneralSpider, self).__init__(*args, **kwargs)
        with open("seeds_es_smp.txt") as f:
            self.la = [urlparse(url.strip()).netloc for url in f.readlines()]
        self.le = LinkExtractor()

    def parse(self, response):
        if not isinstance(response, HtmlResponse):
            return
        for link in self.le.extract_links(response):
            netloc = urlparse(link.url).netloc
            if netloc in self.la:
                r = Request(url=link.url)
                r.meta.update(link_text=link.text)
                yield r
def parse_item(self, response):
    internal_item = InternalItem()
    internal_item["url"] = response.url
    yield internal_item

    # Use the inbuilt LinkExtractor to find URLs, filtering out internal URLs
    extractor_external = LinkExtractor(deny_domains=self.allowed_domains)
    external_links = extractor_external.extract_links(response)
    for link in external_links:
        external_item = ExternalItem()
        external_item["url"] = link.url
        yield external_item

    for src in response.css("img::attr('src')"):
        asset_item = AssetItem()
        asset_item["url"] = response.urljoin(src.extract())
        yield asset_item

    for src in response.css("script::attr('src')"):
        asset_item = AssetItem()
        asset_item["url"] = response.urljoin(src.extract())
        yield asset_item
def extract_links(response, xpaths, tag=None, attr=None):
    """Extract links on a page matching the given XPaths.

    :param response: Scrapy response whose body contains links to extract
    :type response: :class:`scrapy.http.Response`
    :param xpaths: a single XPath or an iterable of XPaths matching links to extract
    :type xpaths: `unicode` or `iterable` of `unicode`
    :param tag: tag name from which to extract links
    :type tag: `unicode`
    :param attr: attribute name in the :data:`tag` tag from which to extract links
    :type attr: `unicode`
    :yield: extracted links (canonicalized URLs), directly usable as
        :data:`scrapy.http.Request.url` parameters
    :rtype: `generator` of `unicode`
    """
    # Construct the LinkExtractor parameters
    extractor_attrs = {
        'restrict_xpaths': xpaths,
        'canonicalize': True,
    }
    if tag:
        extractor_attrs['tags'] = (tag,)
    if attr:
        extractor_attrs['attrs'] = (attr,)
    # Extract the links
    link_extractor = LinkExtractor(**extractor_attrs)
    links = link_extractor.extract_links(response)
    # Yield the canonicalized URLs
    for link in links:
        yield link.url
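A usage sketch for this helper from inside a spider callback; the selector and the parse_document callback are hypothetical, chosen only to illustrate the tag/attr parameters:

def parse(self, response):
    # Follow every link found inside the (assumed) sidebar container
    for url in extract_links(response, '//div[@id="sidebar"]',
                             tag='a', attr='href'):
        yield scrapy.Request(url, callback=self.parse_document)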