class RawHtmlSpider(CrawlSpider):
    name = 'yahoo_news'
    allowed_domains = ['news.yahoo.co.jp', 'headlines.yahoo.co.jp']
    start_urls = [
        # 'http://news.yahoo.co.jp/list/?c=domestic',
        # 'http://news.yahoo.co.jp/list/?c=world',
        # 'http://news.yahoo.co.jp/list/?c=economy',
        # 'http://news.yahoo.co.jp/list/?c=entertainment',
        # 'http://news.yahoo.co.jp/list/?c=sports',
        # 'http://news.yahoo.co.jp/list/?c=computer',
        # 'http://news.yahoo.co.jp/list/?c=science',
        'http://news.yahoo.co.jp/list/?c=local',
    ]

    rules = [
        # Follow pagination links.
        Rule(LxmlLinkExtractor(restrict_xpaths="//a[@class='next']", unique=True)),
        # Follow "pickup" topic pages.
        Rule(LxmlLinkExtractor(allow=(r'pickup'), unique=True)),
        # Extract article links and hand them to parse_article.
        Rule(LxmlLinkExtractor(
                restrict_xpaths="//div[@class='headlineTxt']/a[@class='newsLink']"),
             callback="parse_article"),
    ]

    def parse_article(self, response):
        htmlRes = HtmlResponse(url=response.url, body=response.body)
        item = RawHtmlItem()
        item["url"] = htmlRes.url
        item["body"] = htmlRes.body
        item["encoding"] = htmlRes.encoding
        return item
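# Hedged sketch, not part of the original snippets: a minimal way to run one of
# these CrawlSpider examples from a plain script, assuming the spider class is
# importable.  The FEEDS output path is an illustrative placeholder, not a value
# taken from the sources above.
from scrapy.crawler import CrawlerProcess

def run_raw_html_spider():
    process = CrawlerProcess(settings={
        "FEEDS": {"items.jsonl": {"format": "jsonlines"}},  # hypothetical output target
    })
    process.crawl(RawHtmlSpider)  # any spider class in this collection would work here
    process.start()               # blocks until the crawl finishes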
def __init__(self, **kw):
    super(TheCrawler, self).__init__(**kw)
    self.channel = kw.get('channel')
    self.domain = kw.get('domain')
    full_config_path = '%s%s' % (SITE_CONFIG_PATH, self.channel)
    self.config_path = "%s/%s.txt" % (full_config_path, self.domain)
    self.config_items = self._parse_config(self.config_path)
    try:
        self.url = self.config_items['start_url']
    except KeyError:
        self.url = 'http://%s/' % self.domain
    try:
        self.link_extractor = LxmlLinkExtractor(
            restrict_xpaths=self.config_items['crawl_areas'], unique=True)
    except KeyError:
        self.link_extractor = LxmlLinkExtractor(unique=True)
    self.real_domain = urlparse(self.url).hostname.lstrip('www.')
    self.allowed_domains = [urlparse(self.url).hostname.lstrip('www.')]
    self.cookies_seen = set()
class HabrSpider(CrawlSpider):
    name = 'habr'
    allowed_domains = ['habrahabr.ru']
    start_urls = ['http://habrahabr.ru/']

    rules = (
        Rule(LxmlLinkExtractor(restrict_xpaths=('.//h1/a[@class="post_title"]')),
             callback='parse_item'),
        Rule(LxmlLinkExtractor(restrict_xpaths=('.//*[@id="nav-pages"]/li/a')),
             follow=True),
    )

    def __init__(self, category=None, *args, **kwargs):
        super(HabrSpider, self).__init__(*args, **kwargs)
        log.ScrapyFileLogObserver(open('debug.log', 'w'), level=log.DEBUG).start()
        log.ScrapyFileLogObserver(open('error.log', 'w'), level=log.ERROR).start()

    def parse_item(self, response):
        xpath = './/div[@class="content_left"]'
        sel = response.xpath(xpath)
        if not sel:
            return
        l = ItemLoader(item=HabrahabrItem(), selector=sel, response=response)
        l.add_xpath('title', '//h1/span/text()')
        l.add_xpath('image_urls', '//div[@class="content html_format"]/img/@src')
        comments_items = []
        comments = sel.xpath('//div[starts-with(@class, "message html_format")]').extract()
        for comment in comments:
            comment_item = ItemLoader(item=HabrahabrComment(), selector=sel, response=response)
            comment_item.add_value('comment', comment)
            comments_items.append(comment_item.load_item())
        l.add_value('comments', comments_items)
        yield l.load_item()
class YellowSpider(CrawlSpider):
    name = 'yellow'
    allowed_domains = ['yellow.co.nz']
    # rules = (Rule(LxmlLinkExtractor(allow=(r'\/([A-Z])([A-Z0-9]{9})'),deny=('')),callback='parse_item'),Rule(LxmlLinkExtractor(allow=(''))),),)
    rules = (
        Rule(
            LxmlLinkExtractor(
                allow=(r'https://yellow.co.nz/canterbury-region/plumbers/page/.*',
                       r'.*what=plumbers&where=Canterbury+Region.*')),
            follow=True,
        ),
        Rule(
            LxmlLinkExtractor(allow=(r'https://yellow.co.nz/y/.*'),
                              deny=(r'.*more', r'.*Other')),
            callback='parse_business',
            follow=False,
        ),
        Rule(
            LxmlLinkExtractor(allow=('')),
            follow=False,
        ),
    )

    def __init__(self, *args, **kwargs):
        super(YellowSpider, self).__init__(*args, **kwargs)
        start_url = 'https://yellow.co.nz/canterbury-region/plumbers/page/1?what=plumbers&where=Canterbury+Region'
        self.start_urls = [start_url]

    def parse_business(self, response):
        item = YellowItem()
        print "\n\n---------------------START-----------------------"
        print "\n\n---------------------START-----------------------"
        print "\n\n---------------------START-----------------------"
        print response.url
        item["Company"] = response.xpath(
            '//*[@id="businessDetailsPrimary"]/div[1]/div[3]/h1/span').extract()
        item["PhoneNumber"] = response.xpath(
            '//*[@id="businessDetailsPrimary"]/div[2]/div/span[1]/a[1]').extract()
        item["MailingAddress"] = response.xpath(
            '//*[@id="detailSectionSecondary"]/div[2]/section[3]/div[2]/p').extract()
        item["email"] = response.xpath(
            '//*[@id="businessDetailsPrimary"]/div[2]/div/meta').extract()
        item["url"] = response.url
        print item
        yield item

    def process_links(self, links):
        print "\n LINKS"
        links_list = []
        for i in links:
            if "https://www.tripadvisor.com/Attraction_Review" in i.url:
                links_list.append(i)
                print i.url
        return links_list
class AmazonSpider(CrawlSpider):
    name = 'aragog'
    allowed_domains = ['amazon.in']
    rules = (
        Rule(LxmlLinkExtractor(allow=(r'\/([A-Z])([A-Z0-9]{9})'),
                               deny=(r'product\-reviews', r'offer\-listing', r'ebook')),
             callback='parse_item'),
        Rule(LxmlLinkExtractor(allow=(''))),
    )

    def __init__(
            self,
            start_url='http://www.amazon.in/Laptops/b/ref=nav_shopall_computers_laptop?ie=UTF8&node=1375424031',
            *args, **kwargs):
        super(AmazonSpider, self).__init__(*args, **kwargs)
        self.start_urls = [start_url]

    def parse_item(self, response):
        # print(str(response.url))
        item = AmazonscrapingMongodbItem()
        try:
            name = response.xpath('//*[@id="productTitle"]/text()').extract()[0].encode('ascii', 'ignore')
            item['name'] = name.strip().split("\n")
            item['reviews'] = response.xpath('//*[@id="acrCustomerReviewText"]/text()').extract()[0].encode('ascii', 'ignore')
            item['url'] = response.url
            # print(response.xpath('//*[@id="avgRating"]/span/text()').extract())
            item['rating'] = response.xpath('//*[@id="avgRating"]/span/text()').extract()[0].encode('ascii', 'ignore').replace('\n', ' ').strip()
            item['pid'] = response.url.split('/ref=')[0].split('/')[-1].encode('ascii', 'ignore')
            item['price'] = [
                response.xpath('//*[@id="price"]/table//span[starts-with(@id,"priceblock")]//text()')
                .extract()[1].encode('ascii', 'ignore').strip()
            ]
            item['desc'] = [
                desc.encode('ascii', 'ignore')
                for desc in response.xpath('//*[@id="feature-bullets"]/ul/li/span/text()').extract()
            ]
            item['timestamp'] = [str(datetime.datetime.now())]
            print(item)
        except:
            print('Not a product!')
            item = None
        yield item

    def dummy(self, response):
        print(str(response.url))
class SearchdisconnectSpider(CrawlSpider):
    name = "searchdisconnect"
    allowed_domains = ["https://search.disconnect.me/"]
    start_urls = ('https://search.disconnect.me/', )
    rules = [
        Rule(LxmlLinkExtractor(restrict_xpaths=["//div[@class='pagination']"]),
             callback="parse_links")
    ]
    ITEM_CLASS = SearchdisconnectcrawlerItem

    def __init__(self, keyword="p**n", *args, **kwargs):
        super(SearchdisconnectSpider, self).__init__(*args, **kwargs)
        self._query = keyword

    def parse_start_url(self, response):
        return FormRequest.from_response(response,
                                         formdata={"query": self._query},
                                         callback=self.parse_links)

    def parse_links(self, response):
        item = SearchdisconnectcrawlerItem()
        item["keyword"] = self._query
        urls = response.css("a.title::attr(href)").extract()
        for url in urls:
            item["url"] = url
            yield item
        pagination_links = response.css("div.pagination a::attr(href)").extract()
        for link in pagination_links:
            yield Request(self.start_urls[0] + "searchTerms/" + link[2:],
                          callback=self.parse_links)
class LbColdDriedFruits(LocalBanya2Crawler):
    name = "lb_colddriedfruits"
    start_urls = ['http://www.localbanya.com/products/Fruits-&-Vegetables/Cold-Dried-Fruits/180/234']
    rules = (
        Rule(LxmlLinkExtractor(allow='product-details/Fruits---Vegetables/Cold-Dried-Fruits'),
             callback='parse_product', follow=True),
    )
class SwhSpider(CrawlSpider):
    name = 'swh'
    allowed_domains = ['www.smh.com.au']
    start_urls = ['http://www.smh.com.au/']

    '''def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url, self.parse, meta={
                'splash': {
                    'endpoint': 'render.html',
                    'args': {'wait': 0.5},
                }
            })'''

    def splash_request(self, request):
        return SplashRequest(request.url, self.parse_page,
                             args={'wait': 10, 'timeout': 3600},
                             meta={'real_url': request.url})

    rules = (
        Rule(LxmlLinkExtractor(allow=(), deny=()),
             callback="parse_page",
             process_request="splash_request",
             follow=True),
    )

    def parse_page(self, response):
        t = str(response.css('title::text').extract()[0])
        nt = t + '.text'
        c = ' '.join(response.css('._1665V').xpath('.//p//text()').extract()).encode('utf-8')
        if c:
            with open(os.path.join(dest, nt), 'wb') as f:
                f.write(c)
        yield {'title': t}
class CalendarSpiderSpider(CrawlSpider):
    name = "calendar-spider"
    allowed_domains = ["uvic.ca"]
    start_urls = [
        'http://web.uvic.ca/calendar2015-09/CDs/CSC/CTs.html',
        'http://web.uvic.ca/calendar2015-09/CDs/CSC/466.html'
    ]
    # (http://web.uvic.ca/calendar2015-09/CDs/)(CSC|MATH|SENG).+')
    rules = [
        Rule(LxmlLinkExtractor(
                allow=('(http://web.uvic.ca/calendar2015-09/CDs/).+'),
                restrict_xpaths=('//div[@id="CDpage"]', '//ul[@class="CDTL"]')),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):
        item = WebItem()
        item['title'] = response.xpath('//title/text()').extract().pop().encode('utf-8')
        item['url'] = response.url
        return item
class MySpider(Spider):
    name = 'example'
    link_extractor = LxmlLinkExtractor()

    def parse(self, response):
        soup = BeautifulSoup(response.body)
        # Drop script/style tags before scoring the visible text.
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text()
        response.meta.update(score=KeywordScorer.score(text))
        response.meta.update(content_hash=xxhash.xxh64(
            text.encode('ascii', 'ignore')).intdigest())
        for link in self.link_extractor.extract_links(response):
            request = Request(url=link.url)
            request.meta.update(link_text=link.text)
            link_score = KeywordScorer.score(link.text)
            request.meta.update(score=link_score)
            yield request

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = cls(*args, **kwargs)
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def spider_idle(self):
        # Keep the spider alive even when the scheduler runs dry.
        raise DontCloseSpider
class WwwExpansysComSgCrawler(CrawlSpider):
    name = 'www_expansys_com_sg_crawler'
    allowed_domains = ['expansys.com.sg']
    start_urls = ['http://www.expansys.com.sg/']
    rules = [
        Rule(LinkExtractor(allow='page=\d+#listing'), follow=True),
        Rule(LxmlLinkExtractor(allow=(r'.+/\S+\d+/',), deny=(r'.+/.filter',)),
             callback='parse_item', follow=True),
    ]

    def parse_item(self, response):
        items = list()
        for sel in response.xpath('//div[@id="product"]'):
            item = ExpansysItem()
            item['url'] = response.url or None
            item['sku'] = sel.xpath('//div[@id="prod_core"]/ul/li[1]/span/text()').extract()
            item['title'] = sel.xpath('//div[@id="title"]/h1/text()').extract()
            item['description'] = sel.xpath('//div[@id="description"]/h2/text()').extract()
            item['price'] = sel.xpath('//div[@id="prod_core"]/span/ul[@class="details"]/li[@class="price"]/p[@id="price"]/strong/span/text()').extract()
            item['ean'] = sel.xpath('//div[@id="prod_core"]/ul/li[2]/span/text()').extract()
            item['mpn'] = sel.xpath('//div[@id="prod_core"]/ul/li[3]/span/text()').extract()
            item['brand'] = sel.xpath('//div[@id="prod_core"]/ul/li[4]/a/text()').extract()
            item['currency'] = sel.xpath('//p[@id="price"]/meta/@content').extract()
            item['img_urls'] = sel.xpath('//div[@id="prod_left"]/div[2]/a/img/@src').extract()
            item['categories'] = sel.xpath('//li[@id="n_audio"]/div/div[1]/ul/li/a/text()').extract()
            item['availability'] = sel.xpath('//li[@id="stock"]/text()').extract()
            item['rating'] = sel.xpath('//div[@id="review_avg"]/span[1]/text()').extract()
            items.append(item)
            yield item
class DmozSpiderSpider(CrawlSpider):
    name = "dmoz-spider"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        'http://www.dmoz.org/Computers/',
        'http://www.dmoz.org/Society/',
        'http://www.dmoz.org/Sports/'
    ]
    rules = [
        Rule(LxmlLinkExtractor(allow=('(http://www.dmoz.org/).+')),
             callback='parse_item', follow=True)
    ]
    # LOG_FILE = "data/scrapy_%s.log" % datetime.now().strftime('%Y-%m-%dZ%H-%M')
    # logfile = open(LOG_FILE, 'w')
    # log_observer = ScrapyFileLogObserver(logfile, level=logging.INFO)
    # log_observer.start()

    def parse_item(self, response):
        item = WebItem()
        item['title'] = response.xpath('//title/text()').extract().pop().encode('utf-8')
        item['url'] = response.url
        return item
class CuponomiaScrapper(BaseCouponsCrawler):
    """crawler for site http://www.coupondunia.in"""
    name = 'test'
    allowed_domains = ["promotionalcodes.com"]
    start_urls = ['http://www.promotionalcodes.com/stores-by-letter/m']
    rules = [
        Rule(
            LxmlLinkExtractor(allow=(
                # '1-800-mobiles-coupons',
                'http://promotionalcodes.com/macys-coupons')),
            callback='parse_items',
            follow=False)
    ]

    def __init__(self, *args, **kwargs):
        super(CuponomiaScrapper, self).__init__(*args, **kwargs)
        store_name_path = '//*[@id="bodywrap"]/div/div[3]/div[1]/h2/a/text()'
        store_homepage_path = '//*[@id="bodywrap"]/div/div[3]/div[1]/h2/a/@href'
        store_logo_path = '//*[@id="bodywrap"]/div/div[1]/div/div/img/@src'
        store_description_path = '//*[@id="bodywrap"]/div/div[3]/div[1]/p/text()'
        coupon_name_path = './h3/a/text()'
        coupon_description_path = './p/text()'
        coupon_code_path = '/html/body/div[3]/div[1]/div[5]/div[2]/div[1]/div/article[1]/div[1]/div[1]/div[2]/div[2]/p/a/span[2]/text()'
        coupons_selector_css = 'div.coupon_box.widecoupon:not(.widecoupon-expired):not(.widecoupon-addCouponForm) > div.coupon_content > div.coupon_main_column'
class DmozSpider(CrawlSpider):
    name = "buzzfeedNews"
    allowed_domains = ["buzzfeed.com"]
    start_urls = ["http://www.buzzfeed.com/"]
    rules = (
        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        Rule(LxmlLinkExtractor(allow_domains=('buzzfeed.com')), callback='parse_item'),
    )

    def parse_item(self, response):
        items = []
        depth = response.meta["depth"]
        referring_url = response.request.headers.get('Referer', None)
        current_url = response.url
        title = response.xpath('//div[@id="buzz_header"]//h1/text()').extract()
        for link in response.xpath(
                '//div[@id="buzz_sub_buzz"]//div[not(contains(@class,"share-box"))]//a[not(@rel="nofollow")]/@href[not(contains(text(),"buzzfeed") or contains(text(),"buzzfed"))]'):
            l = link.extract()
            if str(l) != "javascript:;":
                item = BuzzlinksItem()
                item["depth"] = depth
                item["current_url"] = current_url
                item["referring_url"] = referring_url
                item["link"] = link.extract()
                item["article_title"] = title
                parsed_uri = urlparse(link.extract())
                domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
                item["link_domain"] = domain
                items.append(item)
        return items
class CouponduniaScrapper(BaseCouponsCrawler):
    """crawler for site http://www.coupondunia.in"""
    name = 'coupondunia'
    allowed_domains = ["coupondunia.in"]
    start_urls = ['http://www.coupondunia.in/stores']
    rules = [
        Rule(
            LxmlLinkExtractor(
                allow=(
                    # '1-800-mobiles-coupons',
                    # 'http://promotionalcodes.com/macys-coupons',
                    # 'zoffio',
                    # 'ebay',
                ),
                deny=('coupondunia.in/stores'),
                restrict_xpaths=('/html/body/div[2]/div', )),
            callback='parse_items',
            follow=False)
    ]

    def __init__(self, *args, **kwargs):
        super(CouponduniaScrapper, self).__init__(*args, **kwargs)
        store_name_path = '/html/body/div[2]/div/div/div[2]/div[1]/h1/span[1]/text()'
        store_homepage_path = '/html/body/div[2]/div/div/div[1]/div/div[1]/div/div/div/a/@href'
        store_logo_path = '/html/body/div[2]/div/div/div[1]/div/div[1]/div/div/img/@src'
        # store_description_path = '//*[@id="bodywrap"]/div/div[3]/div[1]/p/text()'
        coupon_name_path = './div/div[@class="offer-title"]/a/text()'
        coupon_description_path = './div/div[@class="offer-description-full"]/text()'
        coupon_code_path = './div/div[@class="offer-getcode"]/div/@data-code'
        coupons_selector_css = 'html body div.page-content.dark div.container div.row div.col-19 div#coupon_container.row.margin-left-right-none div.offer-big.offer.sms-parent.col-24'
class ActualSpider(CrawlSpider):
    name = 'newsite'
    allowed_domains = ['edition.cnn.com', 'economictimes.indiatimes.com']
    start_urls = [
        'https://edition.cnn.com/',
        'https://economictimes.indiatimes.com/'
    ]

    '''def abs_link(value):
        return urlparse.urljoin(response.url, value.strip())'''

    # Note: the callback function name should always be something different from parse
    rules = (
        Rule(LxmlLinkExtractor(
                allow=('https://edition.cnn.com/', 'https://economictimes.indiatimes.com/'),
                deny=('https://plus.google.com/', )),
             callback="parse_page",
             follow=True),
    )

    def parse_page(self, response):
        site = response.meta['download_slot']
        t = str(response.css('title::text').extract()[0])
        nt = t + '.text'
        if site == "edition.cnn.com":
            sel = '.zn-body__paragraph *::text'
            dest = '/home/pannaga/work/extraction/extraction/CNN'
        elif site == "economictimes.indiatimes.com":
            sel = '.Normal *::text'
            dest = '/home/pannaga/work/extraction/extraction/ET'
        c = ' '.join(response.css(sel).extract()).encode('utf-8')
        if c:
            with open(os.path.join(dest, nt), 'w') as f:
                f.write(c)
        # yield {'title': t}
        yield {'desti': dest}
class LbFruits(LocalBanya2Crawler):
    name = "lb_exoticfruitsveg"
    start_urls = [
        'http://www.localbanya.com/products/Fruits-&-Vegetables/Exotic-Fruits-&-Vegetables/180/55'
    ]
    rules = (
        Rule(LxmlLinkExtractor(allow='product-details/Fruits---Vegetables/Fruits-'),
             callback='parse_product', follow=True),
    )
class Century21Spider(CrawlSpider):
    name = 'century21'
    allowed_domains = ['century21.fr']
    start_urls = [
        URL_TEMPLATE % postcode
        for postcode in "75010 75011 75012 75018 75019 75020".split(' ')
    ]
    regex = r'http://www\.century21\.fr/trouver_logement/detail/\d+/'
    # process_value hook: reduce a matched URL to its canonical detail prefix.
    f = lambda link: re.match(
        r'(http://www\.century21\.fr/trouver_logement/detail/\d+/).*', link).groups()[0]
    rules = [
        Rule(LinkExtractor(allow=regex), 'parse_ad'),
        Rule(LxmlLinkExtractor(
            allow='.*',
            restrict_xpaths="//div[contains(@class,'btnSUIV_PREC suivant')]/a[contains(text(), 'suivant')]",
            process_value=f)),
    ]

    def parse_ad(self, response):
        pty = Property()
        pty['url'] = response.url
        pty['listed_on'] = self.name
        # Price
        price = ' '.join(response.css('section.tarif span b').xpath('text()').extract())
        pty['price'] = int(
            re.sub('\s+', ' ', price).replace(u'\xa0', '').rstrip().strip(u' \u20ac').replace(' ', ''))
        # Surface
        details = ' '.join(response.css('section.precision p').xpath("text()").extract())
        pty['size'] = float(
            re.search(r'(\d+,?\d*) ?[mM][2\xb2]', details).groups()[0].replace(',', '.'))
        # Post code
        filariane = ' '.join(response.css('div#filAriane div a span').xpath("text()").extract())
        pty['postcode'] = int(re.search(r'(750\d{2})', filariane).groups()[0])
        # Content
        pty['title'] = ' '.join(response.css('h1.h1_page').xpath('text()').extract())
        pty['description'] = ' '.join(
            response.css('div#descTextAnnonce.descriptionLongue p').xpath('text()').extract())
        # Price per square meter
        pty['ppsqm'] = float(pty['price']) / pty['size']
        return pty
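# Hedged sketch, not from the original sources: the same idea as the lambda in
# Century21Spider, written as a standalone process_value callable.  Scrapy calls
# process_value with each extracted URL string; returning a string replaces the
# URL, returning None drops the link.  The pattern below mirrors the one above.
import re

DETAIL_RE = re.compile(r'(http://www\.century21\.fr/trouver_logement/detail/\d+/)')

def keep_detail_prefix(value):
    """Return the canonical detail URL, or None so the link is discarded."""
    match = DETAIL_RE.match(value)
    return match.group(1) if match else None

# Illustrative usage: LxmlLinkExtractor(allow='.*', process_value=keep_detail_prefix)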
class LbLeafies(LocalBanya2Crawler):
    name = "lb_leafies"
    start_urls = [
        'http://www.localbanya.com/products/Fruits-&-Vegetables/Leafies/180/244'
    ]
    rules = (
        Rule(LxmlLinkExtractor(allow='product-details/Fruits---Vegetables/Leafies'),
             callback='parse_product', follow=True),
    )
class NeteaseNewsSpider(CrawlSpider):
    name = "netease_news_spider"
    allowed_domains = ['news.163.com']
    start_urls = ['http://news.163.com/']
    # http://news.163.com/17/0823/20/CSI5PH3Q000189FH.html
    url_pattern = r'(http://news\.163\.com)/(\d{2})/(\d{4})/\d+/(\w+)\.html'
    rules = [
        Rule(LxmlLinkExtractor(allow=[url_pattern]), callback='parse_news', follow=True)
    ]

    def parse_news(self, response):
        sel = Selector(response)
        pattern = re.match(self.url_pattern, str(response.url))
        source = 'news.163.com'
        if sel.xpath('//div[@class="post_time_source"]/text()'):
            time = (sel.xpath('//div[@class="post_time_source"]/text()').extract_first().split()[0]
                    + ' '
                    + sel.xpath('//div[@class="post_time_source"]/text()').extract_first().split()[1])
        else:
            time = 'unknown'
        date = '20' + pattern.group(2) + '/' + pattern.group(3)[0:2] + '/' + pattern.group(3)[2:]
        newsId = pattern.group(4)
        url = response.url
        title = sel.xpath("//h1/text()").extract()[0]
        contents = ListCombiner(sel.xpath('//p/text()').extract()[2:-3])
        comment_url = 'http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/{}'.format(newsId)
        yield Request(comment_url, self.parse_comment,
                      meta={'source': source, 'date': date, 'newsId': newsId,
                            'url': url, 'title': title, 'contents': contents,
                            'time': time})

    def parse_comment(self, response):
        result = json.loads(response.text)
        item = NewsItem()
        item['source'] = response.meta['source']
        item['date'] = response.meta['date']
        item['newsId'] = response.meta['newsId']
        item['url'] = response.meta['url']
        item['title'] = response.meta['title']
        item['contents'] = response.meta['contents']
        item['comments'] = result['cmtAgainst'] + result['cmtVote'] + result['rcount']
        item['time'] = response.meta['time']
        return item
def parse_item(self, response):
    sel = Selector(response)
    items = LinkParser.extract_page_links(sel)
    num_onsite_links = 0
    num_offsite_links = 0
    page_id = ObjectId()
    for item in items:
        item['page_id'] = page_id
        item['domain'] = ""
        item['org_id'] = self.org
        item['referer'] = response.meta.get('Referer')
        if 'uri' in item:
            parse_uri = urlparse(item['uri'])
            item['domain'] = parse_uri[1]
        item['onsite'] = False
        for dom in self.allowed_domains:
            if item['domain'] == "" or item['domain'] in dom:
                item['onsite'] = True
                num_onsite_links = num_onsite_links + 1
        if item['onsite'] == False:
            num_offsite_links = num_offsite_links + 1
        yield item
    page = LinkParser.get_page_data(response)
    page['page_id'] = page_id
    page['useragent'] = response.meta.get('User-Agent')
    page['referer'] = response.meta.get('Referer')
    page['org_id'] = self.org
    page['num_offsite_links'] = num_offsite_links
    page['num_onsite_links'] = num_onsite_links
    yield page
    # limit page depth
    if self.pages_crawled >= settings.PAGES_PER_DOMAIN:
        return
    for link in LxmlLinkExtractor(
            unique=True, allow_domains=self.allowed_domains).extract_links(response):
        if not link.url in self.already_crawled and self.pages_crawled <= settings.PAGES_PER_DOMAIN:
            self.already_crawled.add(link.url)
            self.pages_crawled = self.pages_crawled + 1
            print "yielding request for ", link.url
            yield WebdriverRequest(link.url, callback=self.parse_item)
        elif self.pages_crawled >= settings.PAGES_PER_DOMAIN:
            print "reached max crawl"
            return
        else:
            print "avoiding duplicate request for: ", link.url
class NbadraftSpider(CrawlSpider):
    name = "nbadraftnet"
    allowed_domains = ["nbadraft.net"]
    start_urls = ('http://www.nbadraft.net/articles', )
    calendar = parsedatetime.Calendar()

    rules = (
        Rule(LxmlLinkExtractor(restrict_xpaths="//div[@id='content']//td/a"),
             callback='parse_article'),
        Rule(LxmlLinkExtractor(allow='/articles')),
        Rule(LxmlLinkExtractor(allow='/players/'),
             callback='parse_article',
             cb_kwargs={'base_relevance': 100}),
    )

    def parse_article(self, response, **kwargs):
        relevance = kwargs.get('base_relevance', 0)
        content_selector = response.css('#content-area .content')
        images = content_selector.xpath("//img/@src").extract()
        base = get_base_url(response)
        # Date format example: Fri, 07/27/2012 - 4:16pm
        parsed = self.calendar.parse(response.css('.date::text').extract()[0])
        date = datetime.datetime(*parsed[0][:7])
        main_content = content_selector.extract()[0]
        # Replace relative image links with absolute ones; str.replace returns a
        # new string, so the result must be reassigned.
        for link in images:
            if link[0] == '/':
                main_content = main_content.replace(link, base + link)
        title = response.xpath('//h1/text()').extract()[0]
        yield ArticleItem(
            title=title,
            date=date,
            content=main_content,
            relevance=relevance,
            url=response.url
        )
class TheSpider(CrawlSpider):
    name = 'khana'
    allowed_domains = ['fr.khanacademy.org']
    start_urls = ['https://fr.khanacademy.org']
    rules = (
        Rule(LxmlLinkExtractor(allow_domains=(['fr.khanacademy.org/math/',
                                               'fr.khanacademy.org/science/',
                                               'fr.khanacademy.org/computing/'])),
             callback='parse_url',
             follow=True),
    )

    def parse_url(self, response):
        item = ScraperItems()
        item['links'] = (response.xpath('//a[contains(@class, "topic-list-item")]/@href').extract()
                         + response.xpath('//link[contains(@rel, "image_src")]/@href').extract())
        for i in item['links']:
            # Resolve the relative link against the page URL before requesting it.
            return Request(urlparse.urljoin(response.url, i[1:]))
class SinaNewsSpider(CrawlSpider):
    name = "sina_news_spider"
    allowed_domains = ['news.sina.com.cn']
    start_urls = ['http://news.sina.com.cn']
    # http://finance.sina.com.cn/review/hgds/2017-08-25/doc-ifykkfas7684775.shtml
    # url_pattern = r'(http://(?:\w+\.)*news\.sina\.com\.cn)/.*/(\d{4}-\d{2}-\d{2})/doc-(.*)\.shtml'
    today_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    url_pattern = r'(http://(?:\w+\.)*news\.sina\.com\.cn)/.*/({})/doc-(.*)\.shtml'.format(today_date)
    rules = [
        Rule(LxmlLinkExtractor(allow=[url_pattern]), callback='parse_news', follow=True)
    ]

    def parse_news(self, response):
        sel = Selector(response)
        # Only pages that carry an article title are treated as news articles.
        if sel.xpath("//h1[@id='artibodyTitle']/text()"):
            title = sel.xpath("//h1[@id='artibodyTitle']/text()").extract()[0]
            pattern = re.match(self.url_pattern, str(response.url))
            source = 'sina'
            date = pattern.group(2).replace('-', '/')
            if sel.xpath('//span[@class="time-source"]/text()'):
                time_ = sel.xpath('//span[@class="time-source"]/text()').extract_first().split()[0]
            else:
                time_ = 'unknown'
            newsId = pattern.group(3)
            url = response.url
            contents = ListCombiner(sel.xpath('//p/text()').extract()[:-3])
            comment_elements = sel.xpath("//meta[@name='sudameta']").xpath('@content').extract()[1]
            comment_channel = comment_elements.split(';')[0].split(':')[1]
            comment_id = comment_elements.split(';')[1].split(':')[1]
            comment_url = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel={}&newsid={}'.format(comment_channel, comment_id)
            yield Request(comment_url, self.parse_comment,
                          meta={'source': source, 'date': date, 'newsId': newsId,
                                'url': url, 'title': title, 'contents': contents,
                                'time': time_})

    def parse_comment(self, response):
        if re.findall(r'"total": (\d*)\,', response.text):
            comments = re.findall(r'"total": (\d*)\,', response.text)[0]
        else:
            comments = 0
        item = NewsItem()
        item['comments'] = comments
        item['title'] = response.meta['title']
        item['url'] = response.meta['url']
        item['contents'] = response.meta['contents']
        item['source'] = response.meta['source']
        item['date'] = response.meta['date']
        item['newsId'] = response.meta['newsId']
        item['time'] = response.meta['time']
        return item
def parse(self, response):
    items = []
    for link in LxmlLinkExtractor(allow=self.allowed_domains).extract_links(response):
        item = Gentest1Item()
        item['url'] = link.url
        item['document_name'] = response.meta['document_name']
        items.append(item)
        requests = self.make_requests_from_url(link.url)
        requests.meta['document_name'] = response.meta['document_name']
        items.append(requests)
    return items
class PAPSpider(CrawlSpider):
    name = 'pap'
    allowed_domains = ['pap.fr']
    start_urls = [
        URL_TEMPLATE % (37767 + int(postcode))
        for postcode in "10 11 12 18 19 20".split(' ')
    ]
    regex = r'http://www.pap.fr/annonce/vente-appartements-paris-.*-r\d{9}'
    rules = [
        Rule(LinkExtractor(allow=regex), 'parse_ad'),
        Rule(LxmlLinkExtractor(
            allow='.*',
            restrict_xpaths="//ul[contains(@class,'pagination')]/li[contains(@class,'next')]/a[contains(text(), 'Suivante')]")),
    ]

    def parse_ad(self, response):
        pty = Property()
        pty['url'] = response.url
        pty['listed_on'] = 'pap'
        # Price
        prices = response.css('h1 span.prix').xpath('text()').extract()
        assert len(prices) == 1
        pty['price'] = int(prices[0].rstrip(u' $\u20ac').replace('.', ''))
        # Surface
        li = response.css('.footer-descriptif ul').xpath(
            "//li[contains(span//text(), 'Surface')]").xpath("text()")
        # >>> li.xpath("text()").extract()
        # [u'\n\t\t\t\t\t\t\t\t', u'\n\t\t\t\t\t\t\t\t40\xa0', u'\t\t\t\t\t\t\t']
        assert len(li) == 3
        pty['size'] = float(li[1].extract().strip())
        # Post code
        titles = response.css('.text-annonce h2').xpath("text()").extract()
        assert len(titles) == 1
        match = re.search(r'\d{5}', titles[0])
        assert match is not None
        pty['postcode'] = int(match.group())
        # Content
        pty['title'] = response.css('h1 span.title').xpath('text()').extract()[0] + ' - ' + titles[0]
        pty['description'] = ' '.join(
            response.css('div.text-annonce p').xpath('text()').extract())
        # Price per square meter
        pty['ppsqm'] = float(pty['price']) / pty['size']
        return pty
class recruitSpider(CrawlSpider):
    name = "tencentRecruitSpider"
    allowed_domains = ["tencent.com"]
    # Entry-point URL for the crawl.
    start_urls = ["http://hr.tencent.com/position.php"]

    # Crawl rule derived from the URL of any job-listing page
    # (e.g. http://hr.tencent.com/position.php?&start=1370#a).
    rules = [
        Rule(LxmlLinkExtractor(allow=('/position.php\?&start=\d{,4}#a')),
             follow=True,
             callback='parseItem')
    ]

    # Extract the postings on a listing page into PositionItem objects.
    def parseItem(self, response):
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        sites_even = sel.css('table.tablelist tr.even')
        for site in sites_even:
            item = PositionItem()
            item['name'] = site.css('.l.square a').xpath('text()').extract()[0].encode('gbk')
            relative_url = site.css('.l.square a').xpath('@href').extract()[0]
            item['positionLink'] = urljoin_rfc(base_url, relative_url)
            item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
            item['workPlace'] = site.css('tr > td:nth-child(4)::text').extract()[0]
            item['number'] = site.css('tr > td:nth-child(3)::text').extract()[0]
            item['releaseTime'] = site.css('tr > td:nth-child(5)::text').extract()[0]
            items.append(item)
        sites_odd = sel.css('table.tablelist tr.odd')
        for site in sites_odd:
            item = PositionItem()
            item['name'] = site.css('.l.square a').xpath('text()').extract()[0]
            relative_url = site.css('.l.square a').xpath('@href').extract()[0]
            item['positionLink'] = urljoin_rfc(base_url, relative_url)
            item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
            item['workPlace'] = site.css('tr > td:nth-child(4)::text').extract()[0]
            item['number'] = site.css('tr > td:nth-child(3)::text').extract()[0]
            item['releaseTime'] = site.css('tr > td:nth-child(5)::text').extract()[0]
            items.append(item)
        info('parsed ' + str(response))
        return items

    def _process_request(self, request):
        info('process ' + str(request))
        return request
def parse_page(self, response):
    # ------products_paths------
    products_paths = response.xpath("//div[@id='articles']/div/a/@href").extract()
    for product_path in products_paths:
        item = ProductItem()
        item["path"] = product_path
        request = Request(url="http://www.madeleine.de" + product_path,
                          callback=self.parse_product)
        request.meta['item'] = item
        yield request
    # Follow pagination links ("seite-<n>") and parse them with this same method.
    extr = LxmlLinkExtractor(allow="seite-\d+")
    links = extr.extract_links(response)
    for link in links:
        yield Request(url=link.url, callback=self.parse_page)
class GoogleSpider(CrawlSpider):
    name = 'google'
    allowed_domains = ['google.com']
    # rules = (Rule(LxmlLinkExtractor(allow=(r'\/([A-Z])([A-Z0-9]{9})'),deny=('')),callback='parse_item'),Rule(LxmlLinkExtractor(allow=(''))),),)
    # rules = (Rule(LxmlLinkExtractor(allow=(r'https://www.tripadvisor.com/Attraction_Review.*')),callback='parse_trip', process_links='process_links'),)
    rules = (
        Rule(LxmlLinkExtractor(allow=(r'https://www.google.com/.*')), callback='parse_trip'),
        Rule(LxmlLinkExtractor(allow=('')), follow=False),
    )

    def __init__(self, *args, **kwargs):
        super(GoogleSpider, self).__init__(*args, **kwargs)
        start_url = 'https://www.tripadvisor.com/Attractions-g187337-Activities-Frankfurt_Hesse.html'
        # start_url = 'https://www.tripadvisor.com/'
        self.start_urls = [start_url]

    def parse_trip(self, response):
        item = GoogleItem()
        print "\n\n---------------------START-----------------------"
        print response.url
        # print response.xpath('//a/@href').extract()
        # try:
        item['name'] = response.xpath('//*[@id="HEADING"]/text()').extract()[0].encode('ascii', 'ignore')
        # item['rating'] = parsing_rating(response.xpath('//*[@id="HEADING_GROUP"]/div/div[2]/div[1]/div/span/img').extract())
        # item['neighborhood'] = response.xpath('//*[@id="MAP_AND_LISTING"]/div[2]/div/div[2]/div/div[1]/div/address/span/span').extract()
        # item['classification'] = response.xpath('//*[@id="HEADING_GROUP"]/div/div[3]/div[2]/div').extract()
        item['url'] = response.url
        # item['price'] = response.xpath('//*[@id="ABOVE_THE_FOLD"]/div[2]/div[1]/div/div[2]/div/div[1]/div/div[2]/div[1]/text()').extract()
        # item['hours'] = response.xpath('//*[@id="MAP_AND_LISTING"]/div[2]/div/div[2]/div/div[4]/div/div[2]/div').extract()
        # item['desc'] = response.xpath('//*[@id="OVERLAY_CONTENTS"]/div/p/text()').extract()
        # item['desc'] = [desc.encode('ascii','ignore') for desc in response.xpath('//*[@id="feature-bullets"]/ul/li/span/text()').extract()]
        # usernames = response.xpath('//*[@class="username mo"]').extract()
        # reviews = response.xpath('//*[@class="partial_entry"]/text()').extract()
        # item['reviews'] = zip(usernames, reviews)
        print "\n\n---------------------------------------------------"
        print(item)
        # except:
        #     print('Not a product!')
        #     item = None
        yield item

    def process_links(self, links):
        print "\n LINKS"
        links_list = []
        for i in links:
            if "https://www.tripadvisor.com/Attraction_Review" in i.url:
                links_list.append(i)
                print i.url
        return links_list

    def dummy(self, response):
        print(str(response.url))
class LittleBrotherSpider(CrawlSpider):
    name = "little_brother"
    allowed_domains = ["camara.gov.br"]
    start_urls = (
        # This list should contain all the deputies from this period of time
        # (legislatura 54) -- change this number to get deputies from other periods.
        'http://www.camara.gov.br/internet/deputado/Dep_Lista.asp?Legislatura=54&Partido=QQ&SX=QQ&Todos=None&UF=QQ&condic=QQ&forma=lista&nome=&ordem=nome&origem=None',
        # Example: 'http://www.camara.leg.br/Internet/Deputado/dep_Detalhe.asp?id=141463',
    )
    rules = (
        Rule(LxmlLinkExtractor(allow=(".*Dep_Detalhe\.asp", ),),
             callback="parse_deputy", follow=True,),
        Rule(LxmlLinkExtractor(allow=(".*RelVotacoes\.asp", ),),
             callback="parse_voting", follow=True,),
    )

    def parse_deputy(self, response):
        basic_info = response.xpath("//div[@class='bloco clearedBox']/ul/li")
        item = DeputyItem()
        for sel in basic_info:
            strong = sel.xpath("strong/text()").extract()
            if strong and "nome civil" in strong[0].lower():
                item["name"] = sel.xpath("text()").extract()[0].strip()
            if strong and "partido" in strong[0].lower():
                info = sel.xpath("text()").extract()[0].strip().split("/")
                item["party"] = info[0].strip()
                item["state"] = info[1].strip()
                item["active"] = info[2].strip()
        gid = re.match(".*id=(?P<id>\w*)", response.url)
        item["deputy_id"] = gid.group("id")
        # item["deputy_register"] =
        yield item

    def parse_voting(self, response):
        print "Parsing:", response.url
        pass

    def filter_deputy(self, response):
        pass
class Jdv2Spider(CrawlSpider):
    name = "linkextrator"
    allowed_domains = ["lagou.com"]
    start_urls = ('http://www.lagou.com/zhaopin/Java?labelWords=label', )
    rules = [
        Rule(LxmlLinkExtractor(allow=("http://www.lagou.com/jobs/")),
             callback='parse_item', follow=True)
    ]

    def parse_item(self, response):
        item = CategoryItem()
        item['category'] = response.css(".job_bt").extract()
        return item
class LinkProcedure(BaseProcedure):
    """Link-extraction procedure built on scrapy's LxmlLinkExtractor.

    link xpath
        xpath: string|array, see LxmlLinkExtractor's restrict_xpaths
    """

    def __init__(self, *args):
        xpath = args[0]
        self._extractor = LxmlLinkExtractor(restrict_xpaths=xpath)

    def do(self, input_, **kwargs):
        if isinstance(input_, Response):
            links = self._extractor.extract_links(input_)
            return [i.url.strip() for i in links]
        else:
            raise Exception('link input error')
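# Hedged sketch, not from the original source: exercising a LinkProcedure-style
# extractor offline against a hand-built HtmlResponse.  The HTML body and the
# restrict_xpaths value are invented for illustration.
from scrapy.http import HtmlResponse

body = (b'<html><body><div id="nav"><a href="/page/2">next</a></div>'
        b'<a href="/ignored">outside nav</a></body></html>')
response = HtmlResponse(url='http://example.com/', body=body, encoding='utf-8')

proc = LinkProcedure("//div[@id='nav']")
print(proc.do(response))  # expected: ['http://example.com/page/2']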
class TheCrawler(Spider):
    name = 'TheCrawlerEngineV1'

    def __init__(self, **kw):
        super(TheCrawler, self).__init__(**kw)
        self.channel = kw.get('channel')
        self.domain = kw.get('domain')
        full_config_path = '%s%s' % (SITE_CONFIG_PATH, self.channel)
        self.config_path = "%s/%s.txt" % (full_config_path, self.domain)
        self.config_items = self._parse_config(self.config_path)
        try:
            self.url = self.config_items['start_url']
        except KeyError:
            self.url = 'http://%s/' % self.domain
        try:
            self.link_extractor = LxmlLinkExtractor(
                restrict_xpaths=self.config_items['crawl_areas'], unique=True)
        except KeyError:
            self.link_extractor = LxmlLinkExtractor(unique=True)
        self.real_domain = urlparse(self.url).hostname.lstrip('www.')
        self.allowed_domains = [urlparse(self.url).hostname.lstrip('www.')]
        self.cookies_seen = set()

    def start_requests(self):
        return [Request(self.url, callback=self.parse)]

    def parse(self, response):
        page = self._get_item(response)
        r = [page]
        r.extend(self._extract_requests(response))
        return r

    def _get_item(self, response):
        item = ContentAttributes(
            url=response.url,
            size=str(len(response.body)),
            referer=response.request.headers.get('Referer')
        )
        self._set_content_data(item, response)
        self._set_new_cookies(item, response)
        return item

    def _extract_requests(self, response):
        r = []
        if isinstance(response, Response):
            links = self.link_extractor.extract_links(response)
            r.extend(Request(x.url, callback=self.parse) for x in links)
        return r

    def _set_content_data(self, page, response):
        if isinstance(response, Response):
            title = Selector(response).xpath(self.config_items['title']).extract()
            content = Selector(response).xpath(self.config_items['body']).extract()
            try:
                published_date = Selector(response).xpath(self.config_items['publish_date']).extract()
            except KeyError:
                published_date = None
            try:
                images = Selector(response).xpath(self.config_items['image']).extract()
            except KeyError:
                images = Selector(response).xpath('%s//img/@src' % self.config_items['body']).extract()
            image_urls = []
            for img_url in images:
                image_url_hostname = urlparse(img_url).hostname
                image_url_scheme = urlparse(img_url).scheme
                if image_url_hostname is None:
                    img_url = "http://%s%s" % (self.real_domain, img_url)
                if image_url_scheme is None:
                    img_url = "http://%s" % img_url
                image_urls.append(img_url)
            if title:
                page['title'] = title[0]
            if content:
                page['content'] = content[0]
            pubdate = 0
            if published_date is not None:
                if published_date:
                    day_pattern = self.config_items['publish_date_day_pattern']
                    month_pattern = self.config_items['publish_date_month_pattern']
                    year_pattern = self.config_items['publish_date_year_pattern']
                    time_pattern = self.config_items['publish_date_time_pattern']
                    pubdate = self._translate_publish_date_to_timestamp(
                        published_date[0], day_pattern, month_pattern, year_pattern, time_pattern)
            page['publish_date'] = str(pubdate)
            page['image_urls'] = image_urls
            page['channel'] = self.channel
            page['domain'] = self.real_domain

    def _set_new_cookies(self, page, response):
        cookies = []
        for cookie in [x.split(';', 1)[0] for x in response.headers.getlist('Set-Cookie')]:
            if cookie not in self.cookies_seen:
                self.cookies_seen.add(cookie)
                cookies.append(cookie)
        if cookies:
            page['newcookies'] = cookies

    def _parse_config(self, filename):
        config = {}
        f = open(filename, 'r')
        for line in f.readlines():
            config_per_line = line.split(': ')
            if len(config_per_line) > 1:
                if config_per_line[0] != 'test_url':
                    config[config_per_line[0]] = config_per_line[1].rstrip()
                if config_per_line[0] == 'publish_date_time_pattern':
                    config[config_per_line[0]] = ':'.join(config_per_line[1:]).rstrip()
        f.close()
        return config

    def _translate_publish_date_to_timestamp(self, pubdate, day_pattern, month_pattern,
                                             year_pattern, time_pattern):
        try:
            re_day = re.compile(day_pattern)
            re_month = re.compile(month_pattern)
            re_year = re.compile(year_pattern)
            re_time = re.compile(time_pattern)
            try:
                day = re_day.findall(pubdate)[0]
            except IndexError:
                day = "00"
            try:
                month = re_month.findall(pubdate)[0].lower()
            except IndexError:
                month = "00"
            try:
                year = re_year.findall(pubdate)[0]
            except IndexError:
                year = "0000"
            try:
                parsed_time = re_time.findall(pubdate)[0]
            except IndexError:
                parsed_time = "00:00"
            try:
                pubdate_timestamp = convert_datetime_to_unix_timestamp(
                    "%s/%s/%s %s" % (year,
                                     MONTH_DICTIONARY["%s%s" % (month[0].upper(), month[1:])],
                                     day, parsed_time))
            except KeyError:
                pubdate_timestamp = 0
            return pubdate_timestamp
        except AttributeError:
            return 0
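# Hedged sketch, not from the original source: the "key: value" layout that
# TheCrawler._parse_config appears to expect, one setting per line.  The keys
# below are taken from the lookups in the class; every value is an invented
# placeholder for a hypothetical site config.
EXAMPLE_SITE_CONFIG = r"""
start_url: http://example.com/news/
crawl_areas: //div[@id='main']
title: //h1/text()
body: //div[@class='article-body']
publish_date: //span[@class='date']/text()
image: //div[@class='article-body']//img/@src
publish_date_day_pattern: \b(\d{1,2})\b
publish_date_month_pattern: (January|February|March|April|May|June|July|August|September|October|November|December)
publish_date_year_pattern: (\d{4})
publish_date_time_pattern: (\d{1,2}:\d{2})
"""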