class XinhuaNewsSpider(CrawlSpider):
    name = "xinhua_news_spider"
    start_urls = ['http://www.xinhuanet.com/']
    allowed_domains = ['xinhuanet.com']
    # http://news.xinhuanet.com/fortune/2017-11/10/c_1121937779.htm
    url_pattern = r'http://news.xinhuanet.com/([a-z]+)/*/2017-(\d{1,2})/(\d{1,2})/c\_(\d{6,10}).htm'
    rules = [
        Rule(LxmlLinkExtractor(allow=[url_pattern]),
             callback='parse_news', follow=True)
    ]

    def parse_news(self, response):
        sel = Selector(response)
        title = sel.xpath('//div[@class="h-title"]/text()').extract()
        pattern = re.match(self.url_pattern, str(response.url))
        source = 'xinhuanet.com'
        date = sel.xpath('//div[@class="h-info"]/span/text()').extract()
        time = sel.xpath('//div[@class="h-info"]/span/text()').extract()
        url = response.url
        newsId = re.findall(r'c_(.*?).htm', url, re.S)[0]
        contents = ListCombiner(
            sel.xpath('//div[@id="p-detail"]/p/text()').extract())
        # comments = sel.xpath('//div[@class="right"]/span'
        comments = 0
        item = NewsItem()
        item['source'] = source
        item['time'] = time
        item['date'] = date
        item['contents'] = contents
        item['title'] = title
        item['url'] = url
        item['newsId'] = newsId
        item['comments'] = comments
        yield item

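# `ListCombiner` is used above (and referenced again in the JxcbwNewsSpider
# example below) but is not defined in these snippets. A minimal sketch of a
# plausible helper, assuming it only joins the extracted text fragments:
def ListCombiner(fragments):
    """Join a list of extracted text fragments into one cleaned-up string."""
    return ''.join(fragment.strip() for fragment in fragments)
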
def parse(self, response):
    for link in LxmlLinkExtractor(
            allow=self.allowed_domains).extract_links(response):
        # print(link.url)
        yield scrapy.Request(link.url, self.parse)

    # extracting description and meta from site
    soup = BeautifulSoup(response.text, "lxml")
    title = soup.title.string
    og_type = soup.find("meta", property="og:type")
    og_site_name = soup.find("meta", property="og:site_name")
    og_image = soup.find("meta", property="og:image")
    og_title = soup.find("meta", property="og:title")
    og_url = soup.find("meta", property="og:url")
    raw_text = soup.get_text()

    og_type = og_type.get("content", None) if og_type else None
    og_site_name = og_site_name.get("content", None) if og_site_name else None
    og_image = og_image.get("content", None) if og_image else None
    og_title = og_title.get("content", None) if og_title else None
    og_url = og_url.get("content", None) if og_url else None

    collection = self.db.pages
    # update or insert
    collection.update({"url": response.url}, {
        "$set": {
            "url": response.url,
            "domain": self.allowed_domains[0],
            "title": title,
            "og_type": og_type,
            "og_site_name": og_site_name,
            "og_image": og_image,
            "og_title": og_title,
            "og_url": og_url,
            "raw_text": raw_text
        }
    }, upsert=True)

class MushroomWorldSpider(CrawlSpider):
    name = "mushroom_world_spider"
    start_urls = ["http://www.mushroom.world/mushrooms/namelist"]
    rules = (
        Rule(LxmlLinkExtractor(restrict_xpaths=("//div[@class='item']")),
             follow=True,
             callback='parse_item'),
    )

    def parse_item(self, response):
        name = response.css(".caption b ::text").extract_first().strip()
        description = response.css(".longtextus ::text").extract_first()
        family = response.css(".textus ::text").extract_first()
        location = response.xpath(".//textus[1]/text()").extract()
        dimensions = response.css("#mushroom-list:nth-child(2) ::text").extract()
        edibility = response.css("#mushroom-list:nth-child(3) ::text").extract()
        yield MushroomWorldItem(name=name,
                                description=description,
                                family=family,
                                location=location,
                                dimensions=dimensions,
                                edibility=edibility)

def parse(self, response):
    ## response only from body
    # html = requests.get(response, timeout=40)
    # bs = BeautifulSoup(html.text)
    # body = bs.find("body")
    # print(response.url, len(response.body))
    # responseb = HtmlResponse(url=html.url, body=str(body), encoding="utf-8")
    responseb = HtmlResponse(url=response.url, body=response.body)
    linkObjs = LxmlLinkExtractor().extract_links(responseb)
    ## include(?): re.search("(([.]css|;)$|javascript|mailto:|tel:)", i) is None
    ## keep links orbisweb (not optimal yet!)
    ## include %s/ ?
    pattern = "([.]%s|%s[.])" % (self.allow, self.allow)
    links = [
        l.url for l in linkObjs if re.search(pattern, l.url) is not None
    ]
    links = list(set(links))
    self.nlinks = len(links)
    for l in links:
        # self.successParse(l)
        yield self.successParse(l)

def parse(self, response):
    # print("%s : %s : %s" % (response.status, response.url, response.text))
    # print title text with css and xpath selectors
    title_text = response.css('title::text')
    print(title_text.get())
    title_text = response.xpath('//title[1]/text()')
    print(title_text.get())

    # Get all anchor tags with css and xpath selectors
    css_links = response.css('a::attr(href)').getall()
    xpath_links = response.xpath('//a/@href').getall()
    print(len(css_links))
    print(len(xpath_links))
    for (link, xlink) in zip(css_links, xpath_links):
        print('{} {} '.format(link, xlink))

    # fetch urls from github and avoid social media sites
    trending_links = LxmlLinkExtractor(
        allow=r'^https://[a-z.]+/[a-z.]+$',
        deny_domains=['shop.github.com', 'youtube.com', 'twitter.com'],
        unique=True).extract_links(response)
    for link in trending_links:
        print("%s : %s " % (link.url, link.text))

def parse(self, response: Response) -> Iterator[Request]:
    self.log(response)
    if getattr(self, 'validate_html', False):
        yield Request(
            'http://127.0.0.1:9988/?out=json',
            method='POST',
            headers={'Content-Type': response.headers['Content-Type']},
            body=response.body,
            callback=self._vnu_callback(response.url),
            errback=self.error_callback,
        )
    for link in LxmlLinkExtractor(deny_domains=self.deny_domains,
                                  deny_extensions=['doc'],
                                  tags=self.tags,
                                  attrs=self.attrs,
                                  deny=self.deny,
                                  canonicalize=False).extract_links(response):
        yield from self._make_requests(link.url)

class BaseSpider(scrapy.Spider):
    name = "base"
    link_extractor = LxmlLinkExtractor(allow=(),
                                       deny=(),
                                       allow_domains=(["example.com"]),
                                       deny_domains=(),
                                       deny_extensions=None,
                                       restrict_xpaths=(),
                                       restrict_css=(),
                                       tags=('a', 'area'),
                                       attrs=('href', ),
                                       canonicalize=False,
                                       unique=True,
                                       process_value=None,
                                       strip=True)

    def start_requests(self):
        base_url = 'http://example.com/'
        yield scrapy.Request(url=base_url, callback=self.parse)

    def parse(self, response):
        links = self.link_extractor.extract_links(response)
        links_processing_start = time.time()
        for link in links:
            yield {'source_url': response.url, 'destination_url': link.url}
            yield scrapy.Request(url=link.url, callback=self.parse)
        print("TOTAL TIME TOOK | " + str(time.time() - links_processing_start) +
              " | TOTAL LINKS COUNT | " + str(len(links)))
        yield {
            'url': response.url,
            'download_time': 0,
            'download_latency': response.meta['download_latency']
        }
        os._exit(1)

    def parseError(self, response):
        print('Error')

class ZhihuSpider(CrawlSpider):
    name = 'zhihu2'
    allowed_domains = ['zhuanlan.zhihu.com']
    start_urls = [
        'https://zhuanlan.zhihu.com/bankk',
    ]

    # process_value must be a callable, so the helper is defined before the
    # rules that reference it (it is looked up while the class body runs).
    def add_links(value):
        return 'https://zhuanlan.zhihu.com' + value

    rules = (
        Rule(LinkExtractor(allow=('https://zhuanlan.zhihu.com/(\w+)*$', )),
             callback='parse_item',
             follow=True),
        Rule(LinkExtractor(allow=('https://zhuanlan.zhihu.com/p/(\d+)*$', )),
             callback='parse_item',
             follow=True),
        Rule(LxmlLinkExtractor(allow=('/p/(\d+)*$', ),
                               tags=('a', ),
                               attrs=('href', ),
                               process_value=add_links),
             callback='parse_item',
             follow=True),
    )

    # A pipeline is used here to store the items.
    # When different pipelines are needed, remove the pipeline from settings
    # and define one in each spider, or do the branching inside the pipeline.
    # custom_settings = {
    #     'ITEM_PIPELINES': {
    #         'myscrapytest.pipelines.ZhihuPipeline': 400
    #     }
    # }

    def parse_item(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        title = soup.title.string
        item = ZhihuItem()
        item['title'] = title
        item['url'] = response.url
        yield item
        # self.log('Saved file %s' % filename)

class ImdbSpider(CrawlSpider):
    name = 'imdb'
    allowed_domains = ['www.imdb.cn']
    rules = [
        Rule(LxmlLinkExtractor(allow=r'/title/tt\d+$'),
             callback='parse_imdb',
             follow=True)
    ]

    def start_requests(self):
        for i in range(1, 14616):
            url = "http://www.imdb.cn/nowplaying/" + str(i)
            yield Request(url=url, callback=self.parse)

    def parse_imdb(self, response):
        item = CrawldemoItem()
        item['url'] = response.url
        item['title'] = ''.join(
            response.xpath(
                '//*[@class="fk-3"]/div[@class="hdd"]/h3/text()').extract())
        yield item

def parse_page(self, response):
    """
    General page parser
    :param response:
    :return:
    """
    links_visit = set()
    links = set()
    for link in LxmlLinkExtractor(allow=(), deny=()).extract_links(response):
        links.add(link.url)

    logger.info('Current url: %s' % response.url)
    logger.info('Current resp: %s' % response)

    # Search result - container element
    lists = response.xpath('//div[@id="primary"]//div[@class="listWidget"]')
    for list_widget in lists:
        logger.debug('List widget: %s' % list_widget)
        eapp = list_widget.xpath('div[@class="appRow"]')
        einfo = list_widget.xpath('div[@class="infoSlide"]')
        if len(eapp) == 0:
            logger.warning('No results')
            return
        for eapp1 in eapp:
            logger.debug(eapp1)
            # ahref = eapp1.xpath('div/div/div/h5/a')[0]
            # link = ahref.attrib['href']
            # title = ahref.xpath('text()')
            # logger.debug('Title / link %s %s ' % (title, link))

    logger.debug('Extracted %s links from %s' % (len(links_visit), response.url))
    for link in list(links_visit):
        pass

class EuropythonSpyder(CrawlSpider):
    def __init__(self, year='', *args, **kwargs):
        super(EuropythonSpyder, self).__init__(*args, **kwargs)
        self.year = year
        self.start_urls = [
            'http://ep' + str(self.year) + ".europython.eu/en/events/sessions"
        ]
        print('start url: ' + str(self.start_urls[0]))

    name = "europython_spyder"
    allowed_domains = [
        "ep2015.europython.eu", "ep2016.europython.eu",
        "ep2017.europython.eu", "ep2018.europython.eu"
    ]

    # Pattern for entries that match the conference/talks format
    rules = [
        Rule(LxmlLinkExtractor(allow=['conference/talks']),
             callback='process_response')
    ]

    def process_response(self, response):
        item = EuropythonItem()
        print(response)
        item['title'] = response.xpath(
            "//div[contains(@class, 'grid-100')]//h1/text()").extract()
        item['author'] = response.xpath(
            "//div[contains(@class, 'talk-speakers')]//a[1]/text()").extract()
        item['description'] = response.xpath(
            "//div[contains(@class, 'cms')]//p//text()").extract()
        item['date'] = response.xpath(
            "//section[contains(@class, 'talk when')]/strong/text()").extract()
        item['tags'] = response.xpath(
            "//div[contains(@class, 'all-tags')]/span/text()").extract()
        return item

class Comics(CrawlSpider):
    '''Scrapy spider, inherits from CrawlSpider.'''
    name = "comics"
    allowed_domains = ["www.tazhe.com"]
    start_urls = ["http://www.tazhe.com/mh/"]
    rules = [
        Rule(LxmlLinkExtractor(allow=(r'http://www.tazhe.com/mh/\d+')),
             callback="parse_item"),
    ]

    def __init__(self, *args, **kwargs):
        super(Comics, self).__init__(*args, **kwargs)

    def parse_item(self, response):
        '''Overridden parse callback.'''
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        sel = response.selector
        item = ComicsItem()
        item['name'] = sel.xpath(
            '//*[@id="intro_l"]/div[1]/h1/text()').extract()
        item['author'] = sel.xpath(
            '//*[@id="intro_l"]/div[2]/p[2]/text()').extract()
        item['update_time'] = sel.xpath(
            '//*[@id="intro_l"]/div[2]/p[1]/span/text()').extract()
        item['last_update'] = sel.xpath(
            '//*[@id="intro_l"]/div[1]/span/font/text()').extract()
        item['classification'] = sel.xpath(
            '//*[@id="intro_l"]/div[2]/p[5]/a/text()').extract()
        item['introduction'] = sel.xpath(
            '//*[@id="intro1"]/p/text()[1]').extract()
        item['url'] = response.url
        return item

def parse_obj(self, response):
    item = MyItem()
    item['url'] = []
    for link in LxmlLinkExtractor(
            allow=(),
            deny=(),
            deny_extensions=None,
            tags=('a', 'area', 'q', 'meta', 'track', 'object', 'style',
                  'video', 'applet', 'body', 'button', 'del', 'head', 'html',
                  'input', 'ins', 'img', 'source', 'base', 'blockquote',
                  'embed', 'form', 'frame', 'iframe', 'link', 'script'),
            attrs=('href', 'src', 'data', 'archive', 'codebase', 'poster',
                   'code', 'cite', 'background', 'formaction', 'profile',
                   'xmlns', 'ping', 'longdesc', 'srcset', 'action', 'srcdoc',
                   'scheme'),
            process_value=None,
            unique=True).extract_links(response):
        is_allowed = False
        is_regex_output = False
        for allowed_domain in self.allowed_domains:
            if re.match("^https?:\/\/" + allowed_domain, link.url) is not None:
                is_allowed = True
        if re.match("^https?:\/\/" + self.regex_output, link.url) is not None:
            is_regex_output = True
        if is_allowed:
            item['url'].append(link.url)
        if is_regex_output:
            z = open("re-match-urls.txt", "a")
            z.write(link.url + "\n")
            z.close()
        else:
            f = open("other-urls.txt", "a")
            f.write(link.url + "\n")
            f.close()
    return item

def parse(self, response):
    logger.info('jobdiva|url in parse %s', response.url)
    self.crawler.stats.inc_value('completed_url', 1)
    self.crawler.stats.set_value('spider', 'jobdiva')
    response_value = -2
    temp = {'urls': []}
    tags = ['span', 'td']
    item = parse_fields(self.crawl_request, response, response_value, tags)
    iframe_url = response.css('iframe::attr(src)').extract()
    for url in iframe_url:
        for allowed_domain in self.allowed_domains:
            response_value = url.find(allowed_domain)
            if response_value >= 0:
                yield scrapy.Request(url=url, callback=self.parse)
    if len(item) != 0:
        yield item
    for link in LxmlLinkExtractor(
            allow_domains=self.allowed_domains).extract_links(response):
        url = response.urljoin(link.url)
        temp['urls'].append(url)
        yield scrapy.Request(url=url, callback=self.parse)

def parse_obj(self, response):
    """
    Base parsing routine - pure link extractor
    :param response:
    :return:
    """
    links_visit = set()
    links = set()
    for link in LxmlLinkExtractor(allow=(), deny=()).extract_links(response):
        links.add(link.url)
        # Another filter if desired
        if self.should_follow_link(link.url, response):
            links_visit.add(link.url)

    for d in list(links):
        item = LinkItem()
        item['url'] = d
        yield item

    for d in list(links_visit):
        yield Request(d)

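# `should_follow_link` belongs to the spider above but is not shown; a minimal
# sketch of a plausible filter (an assumption, not the project's actual code):
def should_follow_link(self, url, response):
    """Follow only http(s) links that stay on the spider's allowed domains."""
    from urllib.parse import urlparse
    parsed = urlparse(url)
    return (parsed.scheme in ('http', 'https')
            and any(parsed.netloc.endswith(d) for d in self.allowed_domains))
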
def parse(self, response: Any) -> Generator[Request, None, None]:
    self.log(response)
    for link in LxmlLinkExtractor(deny_domains=self.deny_domains,
                                  deny_extensions=['doc'],
                                  tags=self.tags,
                                  attrs=self.attrs,
                                  deny=self.deny,
                                  canonicalize=False).extract_links(response):
        callback = self.parse  # type: Any
        dont_filter = False
        method = 'GET'
        if self._is_external_url(link.url):
            callback = self.check_existing
            method = 'HEAD'
        elif '#' in link.url:
            dont_filter = True
            callback = self.check_permalink
        yield Request(link.url,
                      method=method,
                      callback=callback,
                      dont_filter=dont_filter,
                      errback=self.error_callback)

def parse(self, response):
    parsed_uri = urlparse(response.url)
    domainurl = '{uri.netloc}'.format(uri=parsed_uri)
    # If the number of downloaded pages from one site exceeds the limit, all
    # following requests for the same domain are removed from the queue
    if int(job_redis.hlen(domainurl)) > self.maximumPagesPerSite:
        regex = re.compile(r'\b' + re.escape(domainurl) + r'\b')
        matching = [i for i in self.start_urls if regex.search(i)]
        for item in matching:
            self.start_urls.remove(item)
        return

    # Skip urls containing anchor marks, phone numbers, emails and login pages
    for link in LxmlLinkExtractor(
            deny=[r'[\S\s]*#[\S\s]*', r'[\S\s]*\/tel:[\S\s]*',
                  r'[\S\s]*\/fax:[\S\s]*', r'[\S\s]*\/mailto:[\S\s]*',
                  r'[\S\s]*\/login[\S\s]*', r'[\S\s]*\/\+[0-9]*$'],
            allow_domains=self.allow_domains).extract_links(response):
        if int(job_redis.hlen(domainurl)) > self.maximumPagesPerSite:
            break
        else:
            self.start_urls.append(link.url)

    # Add sites responding with a status code from 400 to 600 to an error list
    if response.status in range(400, 600):
        job_redis.sadd('error', response.url)
    else:
        item = StandaloneItem()
        tempinput = response.xpath("//body")
        # Extract the domain, title, text and url of a website
        if tempinput:
            templist = []
            templist.append(re.sub(r'\s+', ' ', tempinput.extract()[0].strip()))
            item['domain'] = [domainurl]
            item['data'] = templist
            item['title'] = response.xpath("normalize-space(//title)").extract()
            item['link'] = [response.url]
            return item
        else:
            job_redis.sadd('error', response.url)

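# `job_redis` and `maximumPagesPerSite` are defined elsewhere in that project;
# a minimal sketch of the setup the snippet above appears to assume (host,
# port and db number are assumptions):
import redis

job_redis = redis.StrictRedis(host='localhost', port=6379, db=0)
# One Redis hash per domain tracks the pages already downloaded for that site,
# so job_redis.hlen(domain) gives the per-domain page count checked above.
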
class DaneSpider(CrawlSpider):
    formatter = LinksFormatter()
    name = "Dane"
    domain = "funes.uniandes.edu.co"
    allowed_domains = [domain]
    start_urls = ('http://' + domain, )
    rules = [
        Rule(LxmlLinkExtractor(allow=(),
                               allow_domains=domain,
                               process_value=formatter.formatLink),
             'parsePages',
             follow=True)
    ]

    def parsePages(self, response):
        linkExtractor = LxmlLinkExtractor(
            deny_extensions=[], process_value=self.formatter.formatLink)
        item = ScraperdaneItem()
        item["name"] = response.url
        item["children"] = [
            link.url for link in linkExtractor.extract_links(response)
        ]
        return item

class QuotesSpider(CrawlSpider):
    name = "wiki"
    allowed_domains = ["en.wikipedia.org"]
    deny = [
        '#', 'index.php', 'Wikipedia:', 'Portal:', 'Special:', 'Help:',
        'Talk:', 'File:', 'User:', 'Template:', 'Category:', '/Main_Page'
    ]
    start_urls = [
        'https://en.wikipedia.org',
    ]
    rules = (
        Rule(LxmlLinkExtractor(allow_domains=allowed_domains, deny=deny),
             callback='parse_obj',
             follow=True),
    )

    def parse_obj(self, response):
        item = MyItem()
        item['url'] = []
        for link in LxmlLinkExtractor(allow_domains=self.allowed_domains,
                                      deny=self.deny).extract_links(response):
            item['url'].append(link.url.rstrip('/'))
        yield {'url': response.url.rstrip('/'), 'items': item['url']}

class JxcbwNewsSpider(CrawlSpider):
    name = "jxcbw_news_spider"
    allowed_domains = ['jxcbw.cn']
    start_urls = ['http://www.jxcbw.cn/mainpages/default.aspx']
    # http://www.jxcbw.cn/mainpages/NewsInfo.aspx?NewsID=69366&NewsType=LE123
    # http://www.jxcbw.cn/mainpages/NewsInfo.aspx?NewsID=70152&NewsType=LE107
    # The "?" before the query string must be escaped so the pattern matches
    # the example URLs above.
    url_pattern = r'http://www.jxcbw.cn/[a-z]+/NewsInfo.aspx\?(NewsID=\d{3,8}&NewsType=LE\d{1,7})'
    rules = [
        Rule(LxmlLinkExtractor(allow=[url_pattern]),
             callback='parse_news', follow=True)
    ]

    def parse_news(self, response):
        sel = Selector(response)
        pattern = re.match(self.url_pattern, str(response.url))
        source = 'www.jxcbw.cn'
        time = sel.xpath('//span[@class="time fl"]/text()').extract()
        date = time[0]
        title = sel.xpath('//h2[@class="title-class2"]/text()').extract()
        newsId = pattern.group(1)
        url = response.url
        # if sel.xpath('//div[@id="content"]/div/text()'):
        #     contents = ListCombiner(sel.xpath('//div[@id="content"]/div/text()').extract())
        # else:
        #     contents = "unknown"
        comments = 0
        contents = 0
        item = NewsItem()
        item['source'] = source
        item['title'] = title
        item['date'] = date
        item['time'] = time
        item['newsId'] = newsId
        item['url'] = url
        item['contents'] = contents
        item['comments'] = comments
        yield item

class A80sSpider(CrawlSpider):
    name = '80s'
    allowed_domains = ['www.80s.tw']

    def start_requests(self):
        for page in range(1, self.settings.get('MAX_PAGE') + 1):
            url = 'https://www.80s.tw/movie/list/-----p/{page}'.format(
                page=page)
            yield scrapy.Request(url=url)

    rules = (
        Rule(LxmlLinkExtractor(
            allow='/movie/\d+',
            restrict_css='#block3 > div.clearfix.noborder.block1 > ul.me1.clearfix > li > a'),
            callback='parse_detail'),
    )

    def parse_detail(self, response):
        item = Movie1Item()
        item['href'] = response.css(
            '#myform > ul > li.clearfix.dlurlelement.backcolor1 > span.dlname.nm > span > a::attr(href)'
        ).extract_first()
        item['title'] = response.css(
            '#minfo > div.info > h1::text').extract_first()
        yield item

class axxelerate_spider(CrawlSpider):
    name = 'axxelerate'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Main_Page']
    rules = (
        Rule(LxmlLinkExtractor(allow=(allowed_domains)),
             callback='parse_obj',
             follow=True),
    )

    def parse_obj(self, response):
        item = url_item()
        item['url'] = response.url
        item['keywords'] = []
        tags = ["h1", "title", "article", "div", "blockquote", "td", "li",
                "p", "span", "strong", "b", "i"]
        for tag in tags:
            texts = response.xpath("//%s/text()" % (tag)).extract()
            for text in texts:
                text = text.encode("latin1", "ignore")
                result = modify_query.query(text)
                item['keywords'] = item['keywords'] + result
        item['title'] = response.xpath("//title/text()").extract_first()
        item['keywords'] = set(item['keywords'])
        item['linksTo'] = []
        for link in LxmlLinkExtractor(allow=(), deny=()).extract_links(response):
            if link.url.startswith('https://en.wikipedia.org'):
                item['linksTo'].append(link.url)
        return item

def parseLinks(self, response):
    links = LxmlLinkExtractor(
        allow=('http://www.bentleyhomes.com.au/properties/[\w-]+/$')
    ).extract_links(response)
    for link in links:
        yield Request(link.url, callback=self.parseItem, meta=response.meta)

class BentleyhomesSpider(CrawlSpider):
    name = 'bentleyhomes'
    allowed_domains = ['www.bentleyhomes.com.au']
    start_urls = ['http://www.bentleyhomes.com.au/']
    rules = (
        Rule(LxmlLinkExtractor(
            allow=('http://www.bentleyhomes.com.au/home-designs/search-home-designs/$')),
            follow=True, callback='parseForm'),
        Rule(LxmlLinkExtractor(
            allow=('http://www.bentleyhomes.com.au/house-and-land/browse-our-hl-packages/$')),
            follow=True, callback='parseList'),
        Rule(LxmlLinkExtractor(
            allow=('http://www.bentleyhomes.com.au/displays-homes/ex-display-homes-for-sale/$')),
            follow=True),
    )
    oth = ('Games', 'Studio', 'Games Room', 'Leisure', 'Rumpus', 'Rooms',
           'Grand Living', 'Bedroom 5', 'Living', 'Retreat', 'M.P.R')
    logo = 'Bentley Homes'

    def parseForm(self, response):
        if response.url.find('home-designs') != -1:
            callback = self.parseLinks
        elif response.url.find('browse-our-hl-packages') != -1:
            callback = self.parseList
        for i in range(1, 4):
            formdata = {'storeys_filter': str(i), 'submit': 'Search'}
            yield FormRequest(response.url,
                              formdata=formdata,
                              meta={'Storey': str(i)},
                              callback=callback)

    def parseLinks(self, response):
        links = LxmlLinkExtractor(
            allow=('http://www.bentleyhomes.com.au/properties/[\w-]+/$')
        ).extract_links(response)
        for link in links:
            yield Request(link.url, callback=self.parseItem, meta=response.meta)

    def parseList(self, response):
        referer = response.request.headers.get('Referer', None).decode("utf-8")
        hxs = HtmlXPathSelector(response)
        hxsItemsList = hxs.xpath('//div[@class="property-item"]')
        for hxsItem in hxsItemsList:
            l = RealtyLoader(RealtyspidersItem(), hxsItem)
            l.add_value('url', response.url)
            l.add_value('BuildType', 'Browse our H&L packages')
            l.add_value('BuilderLogo', self.logo)
            l.add_xpath('Lot_BlockAddress', './/span[@class="street"]/text()')
            l.add_xpath('Squares', './/span[@class="area"]/text()')
            l.add_xpath('Bedrooms', '//li[@class="beds"]/text()')
            l.add_xpath('Bathrooms', '//li[@class="baths"]/text()')
            l.add_xpath('Garage', '//li[@class="garages"]/text()')
            l.add_xpath('LivingArea', '//li[@class="storeys"]/text()')
            l.add_xpath('BasePrice',
                        './/div[@class="field-prefix" and text()="$"]/following-sibling::div[@class="field-value"]/text()')
            l.add_xpath('HomeDesignMainImage', './/img/@src')
            yield l.load_item()

    def parseItem(self, response):
        referer = response.request.headers.get('Referer', None).decode("utf-8")
        hxs = HtmlXPathSelector(response)
        # with open('testURL', 'a') as file:
        #     file.write(str(response.meta) + '\n')
        #     file.writelines('\n'.join(hxs.xpath('//div[@class="col-md-8"]/table/tbody/tr/td[1]/text()').extract()))

        # The room name is substituted via .format(...) in the calls below.
        roomsXpath = '''//div[@class="room_dimensions overview_table"]
            //tr/td[text()="{}"]/following-sibling::td/text()'''
        overviewXpath = '''//table[@id="hf-property-overview"]/tr/td/div[text()="{}"]/ancestor::td/following-sibling::
            td[@class="item-value"]/div/div[@class="field-value"]/text()'''
        imgXpath = '//div[@class=" flexslider_gallery image hf-property-gallery"]/div/ul/li[{}]/img/@src'
        descriptionXPath = '//div[@id="col-md-8"]/p/text()'

        # data = hxs.xpath(roomsXpath).extract()
        # with open('testURL', 'a') as file:
        #     for i in data:
        #         file.write(i + '\n')

        other = []
        for name in self.oth:
            size = hxs.xpath(roomsXpath.format(name)).extract_first()
            if size:
                other.append('{}:{}'.format(name, size))

        l = RealtyLoader(RealtyspidersItem(), hxs)
        l.add_value('url', response.url)
        l.add_value('BuildType', self._getBuildType(referer))
        l.add_value('BuilderLogo', self.logo)
        l.add_xpath('DesignName', '//h3[@class="title-post"]/text()')
        l.add_value('State', 'MELBOURNE')
        l.add_xpath('Squares', '//div[@class="info-box1 "]/p[1]/text()')
        l.add_xpath('Bedrooms', '//li[@class="beds"]/text()')
        l.add_xpath('Bathrooms', '//li[@class="baths"]/text()')
        l.add_xpath('Garage', '//li[@class="garages"]/text()')
        l.add_xpath('BasePrice',
                    '//div[@class="field-prefix" and text()="$"]/following-sibling::div[@class="field-value"]/text()')
        l.add_value('Storey', self._getStorey(response.meta['Storey']))
        l.add_xpath('HouseWidth', '//div[text()="MIN. BLOCK WIDTH"]/text()[2]')
        l.add_xpath('HouseLength', '//div[text()="\n MIN. BLOCK LENGTH"]/text()[2]')
        l.add_xpath('BrochureImage_pdf', '//a[text()="Brochure"]/@href')
        l.add_xpath('InclusionsImage_pdf', '//a[text()="Inclusions"]/@href')
        l.add_xpath('FloorPlanImage1', '//a[@class="floor-plan fancybox"]/img/@src')
        l.add_xpath('HomeDesignMainImage', imgXpath.format('1'))
        # Image1..Image15 map to gallery slots 2..16
        for i in range(1, 16):
            l.add_xpath('Image{}'.format(i), imgXpath.format(str(i + 1)))
        l.add_xpath('MasterBedroomDimension', roomsXpath.format('Master Bedroom'))
        l.add_xpath('Bedroom2Dimension', roomsXpath.format('Bedroom 2'))
        l.add_xpath('Bedroom3Dimension', roomsXpath.format('Bedroom 3'))
        l.add_xpath('Bedroom4Dimension', roomsXpath.format('Bedroom 4'))
        l.add_xpath('StudyDimension',
                    [roomsXpath.format('Study'), roomsXpath.format('Study nook')])
        l.add_xpath('Meals_DiningDimension', roomsXpath.format('Meals'))
        l.add_xpath('FamilyDimension', roomsXpath.format('Family'))
        l.add_xpath('AlfrescoDimension', roomsXpath.format('Alfresco'))
        l.add_xpath('LoungeDimension', roomsXpath.format('Lounge'))
        l.add_xpath('TheatreDimension', roomsXpath.format('Theatre'))
        l.add_value('OtherInclusions', ', '.join(other))

        # Block Yes/No fields
        l.add_xpath('TheatreRoom_Yes_No', roomsXpath.format('Theatre'))
        l.add_xpath('SeparateMeals_Yes_No', roomsXpath.format('Meals'))
        l.add_xpath('Alfresco_Yes_No', roomsXpath.format('Alfresco'))
        l.add_xpath('Study_Yes_No',
                    [roomsXpath.format('Study Nook'), roomsXpath.format('Study')])
        l.add_xpath('WalkinPantry_Yes_No', descriptionXPath,
                    **{'re': '([Ww]alkin|[Pp]antry)'})
        l.add_xpath('BultersPantry_Yes_No', descriptionXPath,
                    **{'re': '[Bb]ulter[`]?s?'})
        l.add_xpath('SteelStructure_Yes_No', descriptionXPath,
                    **{'re': '([Ss]teel.*[Ss]tructure)|([Ss]tructure.*[Ss]teel)'})
        l.add_xpath('Balcony_Yes_No', roomsXpath.format('Balcony'))
        # Warranty
        l.add_xpath('SturturalWarranty', descriptionXPath,
                    **{'re': '.*guarantee.*|.*[Ww]arranty.*'})
        # Windows
        l.add_xpath('Windows', descriptionXPath, **{'re': '.*[Ww]indows?.*'})
        # Kitchen benchtop
        l.add_xpath('KitchenBenchtop', descriptionXPath,
                    **{'re': '.*[Kk]itchen.*[Bb]enchtop.*|.*[Bb]enchtop.*[Kk]itchen.*'})
        # Security system
        l.add_xpath('SecuritySystem', descriptionXPath,
                    **{'re': '.*[Ss]ecurity.*[sS]ystem.*|.*[sS]ystem.*[Ss]ecurity.*'})
        # Energy rating
        l.add_xpath('EnergyRating', descriptionXPath,
                    **{'re': '.*[Ee]nergy.*[rR]ating.*|.*[rR]ating.*[Ee]nergy.*'})
        # Kitchen appliances
        l.add_xpath('KitchenAppliance', descriptionXPath,
                    **{'re': '.*([Kk]itchen.*[Aa]ppliance).*|.*([Aa]ppliance.*[Kk]itchen).*'})
        # Appliance brand
        l.add_xpath('ApplianceBrand', descriptionXPath,
                    **{'re': '.*[\w\s]+[Ss]ecurity System.*'})
        # Splashback (tiling above the sink)
        l.add_xpath('Splashback', descriptionXPath, **{'re': '.*[Ss]plashback.*'})
        # Floor covering
        l.add_xpath('FloorCovering', descriptionXPath,
                    **{'re': '.*[Ff]loor.*[Cc]overings?.*|.*[Cc]overings?.*[Ff]loor.*'})
        # Cooling
        l.add_xpath('Cooling', descriptionXPath, **{'re': '.*[Cc]ooling.*'})
        # Bath
        l.add_xpath('Bath', descriptionXPath, **{'re': '.*[Ss]ecurity.*[Ss]ystem.*'})
        # Ceiling height
        l.add_xpath('CeilingHeight', descriptionXPath, **{'re': '.*[Bb]ath.*'})
        # Ensuite wall tiling
        l.add_xpath('EnsuiteWallTiling', descriptionXPath, **{'re': '.*[Tt]ile.*'})
        # Ensuite benchtop
        l.add_xpath('EnsuiteBenchtop', descriptionXPath,
                    **{'re': '.*[Ee]nsuite.*[Bb]enchtop.*|.*[Bb]enchtop.*[Ee]nsuite.*'})
        # Shower base
        l.add_xpath('EnsuiteShowerbase', descriptionXPath,
                    **{'re': '.*[Ss]howerbase.*'})
        # Wall paint
        l.add_xpath('WallPaint', descriptionXPath,
                    **{'re': '.*[Ww]all.*[Pp]aint.*|.*[Pp]aint.*[Ww]all.*'})
        # Walk-in robe fitouts
        l.add_xpath('WIRFitouts', descriptionXPath,
                    **{'re': '.*walk in robe.*|.*WIR.*'})
        # Downlights
        l.add_xpath('Downlights', descriptionXPath, **{'re': '.*[Dd]ownlights.*'})
        # Landscaping
        l.add_xpath('Landscaping', descriptionXPath, **{'re': '.*[Ll]andscaping.*'})
        # Driveway
        l.add_xpath('Driveway', descriptionXPath, **{'re': '.*[Dd]riveway.*'})
        # Promotion
        l.add_xpath('Promotion', descriptionXPath, **{'re': '.*[Pp]romotion.*'})
        # Other inclusions
        # l.add_xpath('OtherInclusions',
        #             descriptionXPath, **{'re': '[\w\s]+[Ss]ecurity System'})
        # l.add_xpath('OtherInclusions1',
        #             descriptionXPath, **{'re': '[\w\s]+[Ss]ecurity System'})
        # l.add_xpath('OtherInclusions2',
        #             descriptionXPath, **{'re': '[\w\s]+[Ss]ecurity System'})
        # l.add_xpath('OtherInclusions3',
        #             descriptionXPath, **{'re': '[\w\s]+[Ss]ecurity System'})
        # l.add_xpath('OtherInclusions4',
        #             descriptionXPath, **{'re': '[\w\s]+[Ss]ecurity System'})
        # l.add_xpath('OtherInclusions5',
        #             descriptionXPath, **{'re': '[\w\s]+[Ss]ecurity System'})
        return l.load_item()

    def _getBuildType(self, url):
        if url.find('dual-occupancy') != -1:
            return 'Dual Occupancy'
        elif url.find('ex-display-homes-for-sale') != -1:
            return 'Display Homes for Sale'
        elif url.find('view-displays-homes') != -1:
            return 'Display Homes'
        elif url.find('completed-homes') != -1:
            return 'Completed Homes for Sale'
        elif url.find('search-home-designs') != -1:
            return 'Home Designs'
        elif url.find('browse-our-hl-packages') != -1:
            return 'H&L packages'

    def _getStorey(self, data):
        if data == '1':
            return 'Single'
        elif data == '2':
            return 'Double'
        elif data == '3':
            return 'Split level'

class DarkWebSpider(CrawlSpider):
    name = 'darkWebBot'
    allowed_domains = ["onion"]
    start_urls = [
        "https://ahmia.fi/address/"
        # "https://ahmia.fi/address/"
        # "http://check.torproject.org/"
    ]
    rules = (
        Rule(LxmlLinkExtractor(allow=()), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        # i = response.xpath('//h1/@class').extract()[0]
        # i['name'] = response.xpath('//div[@id="name"]').extract()
        # i['description'] = response.xpath('//div[@id="description"]').extract()
        # f = open("/Users/laveeshrohra/Documents/Workspace/checkPolipo.txt", "w+")
        # f.write("class = %s" % (response.body))
        # f.close()
        hxs = HtmlXPathSelector(response)
        item = CrawledWebsiteItem()
        item['url'] = response.url
        item['server_header'] = str(response.headers)
        title_list = hxs.xpath('//title/text()').extract()
        h1_list = hxs.xpath("//h1/text()").extract()
        item['h1'] = " ".join(h1_list)
        h2_list = hxs.xpath("//h2/text()").extract()
        item['h2'] = " ".join(h2_list)
        title = ' '.join(title_list)
        item['title'] = title
        body_text = self.html2string(response)
        words = self.extract_words(body_text)
        item['text'] = title + " " + " ".join(words)
        return item

    def detect_encoding(self, response):
        return response.headers.encoding or "utf-8"

    def html2string(self, response):
        """HTML 2 string converter. Returns a string."""
        converter = html2text.HTML2Text()
        converter.ignore_links = True
        encoding = self.detect_encoding(response)
        decoded_html = response.body.decode(encoding, 'ignore')
        string = converter.handle(decoded_html)
        return string

    def extract_words(self, html_string):
        """Stems and counts the words. Works only in English!"""
        string_list = re.split(r' |\n|#|\*', html_string)
        # Cut a word list that is larger than 10000 words
        if len(string_list) > 10000:
            string_list = string_list[0:10000]
        words = []
        for word in string_list:
            # Word must be longer than 0 letters and shorter than 45.
            # The longest word in a major English dictionary is
            # Pneumonoultramicroscopicsilicovolcanoconiosis (45 letters)
            if len(word) > 0 and len(word) <= 45:
                words.append(word)
        return words

class FoneArenaSpider(AlaCrawlSpider):
    name = 'fonearena'
    allowed_domains = ['fonearena.com']
    start_urls = ['http://www.fonearena.com/reviews.php']

    rules = [
        Rule(LxmlLinkExtractor(restrict_xpaths='//figure[@class="effect3"]/a',
                               unique=True),
             callback="parse_review"),
        Rule(LxmlLinkExtractor(restrict_xpaths='//a[@title="next page"]',
                               unique=True))
    ]

    def parse_review(self, response):
        if not response.url.endswith(".php"):
            product = ProductItem()
            review = ReviewItem()

            review['TestTitle'] = self.extract(response.xpath('//h2/text()'))
            if review['TestTitle']:
                matches = re.search("^(.*?) review", review['TestTitle'], re.IGNORECASE)
                if matches:
                    review['ProductName'] = matches.group(1)
                    product['ProductName'] = matches.group(1)
                else:
                    review['ProductName'] = review['TestTitle']
                    product['ProductName'] = review['TestTitle']

            review['Author'] = self.extract(response.xpath('//a[@rel="author"]/text()'))

            date_span = self.extract(response.xpath('//span[@class="updated"]/text()'))
            if date_span:
                matches = re.search(r'(\S+ \d+, \d+) ', date_span)
                if matches:
                    date_span = matches.group(1)
                review['TestDateText'] = date_format(date_span, '%B %d, %Y')

            product['PicURL'] = self.extract(response.xpath('//div[contains(@class,"entry")]/p//img/@src'))

            review['TestSummary'] = self.extract_all(response.xpath('//div[contains(@class,"entry")]/p[1]//text()'), separator=" ")
            if not review['TestSummary']:
                review['TestSummary'] = self.extract_all(response.xpath('//div[contains(@class,"entry")]/p[2]//text()'), separator=" ")

            review['TestVerdict'] = self.extract_all(response.xpath('//div[contains(@class,"entry")]/p[strong[contains(text(),"Conclusion")]]/following-sibling::p/text()'), separator=" ")
            if not review['TestVerdict']:
                review['TestVerdict'] = self.extract_all(response.xpath('//div[contains(@class,"entry")]/h2[contains(text(),"Conclusion")]/following-sibling::p/text()'), separator=" ")

            review['TestPros'] = self.extract_all(response.xpath('//div[contains(@class,"entry")]/p[strong[contains(text(),"Pros")]]/following-sibling::*[1]/li/text()'), separator="; ")
            if not review['TestPros']:
                review['TestPros'] = self.extract_all(response.xpath('//div[contains(@class,"entry")]/h3[contains(text(),"Pros")]/following-sibling::*[1]/li/text()'), separator="; ")

            review['TestCons'] = self.extract_all(response.xpath('//div[contains(@class,"entry")]/p[strong[contains(text(),"Cons")]]/following-sibling::*[1]/li/text()'), separator="; ")
            if not review['TestCons']:
                review['TestCons'] = self.extract_all(response.xpath('//div[contains(@class,"entry")]/h3[contains(text(),"Cons")]/following-sibling::*[1]/li/text()'), separator="; ")

            product['OriginalCategoryName'] = "Miscellaneous"
            review['DBaseCategoryName'] = "PRO"
            product['TestUrl'] = response.url
            review['TestUrl'] = response.url

            yield product
            yield review

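# `extract` and `extract_all` come from the AlaCrawlSpider base class, which is
# not shown in this snippet. A minimal sketch of the behaviour the calls above
# appear to rely on (an assumption, not the project's actual base class):
class AlaCrawlSpider(CrawlSpider):
    def extract(self, selector_list):
        """Return the first extracted value, stripped, or None if nothing matched."""
        values = selector_list.extract()
        return values[0].strip() if values else None

    def extract_all(self, selector_list, separator=" "):
        """Join all extracted values with the given separator."""
        return separator.join(v.strip() for v in selector_list.extract())
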
class imdb_spider(CrawlSpider):
    settings = get_project_settings()
    name = "imdb_spider"
    allowed_domains = ['imdb.com']
    start_urls = ['https://www.imdb.com/search/title?release_date=1980-01-01,']
    deny_urls = ['']

    with open(settings.get('DENIED_DOMAINS')) as f:
        content = f.readlines()
    no_domains = [x.strip() for x in content]
    no_ext = ['']
    tags = [
        'a', 'area', 'audio', 'embed', 'iframe', 'img', 'input', 'script',
        'source', 'track', 'video', 'form'
    ]
    # attrs = ['href', 'src', 'action']
    attrs = ['href']
    people_links = {}
    detail_fields = [
        "Taglines:", "Country:", "Language:", "Budget:",
        "Cumulative Worldwide Gross:", "Production Co:"
    ]
    director_fields = ["Director:", "Writers:"]

    movie_link = r'/title/\w+/\?ref_=adv_li_tt'
    nextpage_link = r'/search/title\?release_date=1980-01-01,&start=\d+&ref_=adv_nxt'
    rules = (
        Rule(LxmlLinkExtractor(allow=movie_link),
             callback='parse_movie',
             follow=False),
        Rule(LxmlLinkExtractor(allow=nextpage_link),
             callback='parse_nextpage',
             follow=True),
    )

    def parse_nextpage(self, response):
        print("[ PAGE ] {}".format(response.request.url))

    def parse_movie(self, response):
        # logger.info(">>>>> Movie: {}".format(response.request.url))
        print("[ MOVIE ] {}".format(response.request.url))

        # inputs
        movie_id = response.request.url.split('/')[4]
        title = ''.join(
            list(
                filter(
                    lambda x: x in string.printable,
                    response.xpath('//div[@class="title_wrapper"]/h1/text()').
                    extract_first().strip())))
        film_rating = response.xpath(
            '//div[@class="subtext"]/text()').extract_first()
        duration = response.xpath(
            '//div[@class="subtext"]/time/text()').extract_first()
        genre = ''.join(
            list(
                map(
                    str.strip,
                    str(
                        response.xpath(
                            '//div[@class="subtext"]/a[not(@title="See more release dates")]/text()'
                        ).extract()))))
        release_date = response.xpath(
            '//div[@class="subtext"]/a[@title="See more release dates"]/text()'
        ).extract_first()
        imdb_ratingValue = response.xpath(
            '//span[@itemprop="ratingValue"]/text()').extract_first()
        imdb_bestRating = response.xpath(
            '//span[@itemprop="bestRating"]/text()').extract_first()
        imdb_ratingCount = response.xpath(
            '//span[@itemprop="ratingCount"]/text()').extract_first()
        description = response.xpath(
            '//div[@class="summary_text"]/text()').extract_first()
        storyline = response.xpath(
            '//div[@id="titleStoryLine"]/div/p/span/text()').extract_first()

        lables = response.xpath(
            '//div[contains(@class, "plot_summary")]/div[@class="credit_summary_item"]/h4/text()'
        ).extract()
        credits = dict.fromkeys(['director', 'creator', 'writer', 'stars'])
        k = 0
        for x in lables:
            persons = response.xpath(
                '//div[contains(@class, "plot_summary")]/div[' + str(k) +
                '][@class="credit_summary_item"]/a/text()').extract()
            if 'See full cast & crew' in persons:
                persons.remove('See full cast & crew')
            # remove comments between brackets or parentheses
            persons = [
                re.sub("[\(\[].*?[\)\]]", "", p).strip() for p in persons
            ]
            # director(s), creator(s), writer(s), stars
            if 'director' in x.lower():
                credits['director'] = persons
            if 'creator' in x.lower():
                credits['creator'] = persons
            if 'writer' in x.lower():
                credits['writer'] = persons
            if 'star' in x.lower():
                credits['stars'] = persons
            k += 1

        taglines = ''.join(
            response.xpath(
                '//div[@id="titleStoryLine"]/div[@class="txt-block"]/text()').
            extract()).strip()
        url = response.request.url
        poster = response.xpath(
            '//div[@class="poster"]//a/img/@src').extract_first()
        trailer_img = response.xpath(
            '//div[@class="slate"]//a/img/@src').extract_first()
        req_headers = self.headers_format(response.request.headers)
        res_headers = self.headers_format(response.headers)

        # Cleaning inputs
        if not movie_id or not title:
            return

        # convert film_rating unicode into string
        film_rating = film_rating.encode('ascii', 'ignore')
        film_rating = film_rating.strip(
        ) if film_rating and type(film_rating) is str else ''

        # convert release_date unicode into string
        release_date = release_date.encode('ascii', 'ignore')
        release_date = release_date.strip(
        ) if release_date and type(release_date) is str else ''

        # if it's a movie, the date is in "11, MARCH 2013 (USA)" format:
        # keep only the time part (without the country name), then convert
        # into datetime and unix time
        if release_date[0].isdigit() == True:
            release_date_unix_time = parser.parse(release_date.split("(")[0])
            release_date_unix_time = time.mktime(
                release_date_unix_time.timetuple())
        # if it's a TV series, the date is in "TV SERIES (2013 - ?)" format:
        # keep only the 4-digit year, then convert into datetime and unix time
        if release_date.split("(")[1][0:4].isdigit():
            release_date_unix_time = parser.parse(
                "1, Jan " + release_date.split("(")[1][0:4])
            release_date_unix_time = time.mktime(
                release_date_unix_time.timetuple())

        # convert duration unicode into string
        if (duration is not None):
            duration = duration.encode('ascii', 'ignore')
            duration = duration.strip(
            ) if duration and type(duration) is str else ''
            # duration is in "1h 40min" format; pull the ints out of the
            # string into an array like ["1", "40"]
            hour_min = (re.findall(r'\d+', duration))
            # if hour_min has 2 elements, the first is hours, the second minutes
            if (len(hour_min) == 2):
                duration = (int(hour_min[0]) * 60 + int(hour_min[1]))
            # if hour_min has 1 element, it could be minutes or hours
            if (len(hour_min) == 1):
                # if hour_min holds an hour value like "3h", the last char is "h"
                if (duration[-1:] == "h"):
                    duration = (int(hour_min[0]) * 60)
                # else it is minutes
                else:
                    duration = (int(hour_min[0]))

        imdb_ratingValue = self.input2num(imdb_ratingValue)
        imdb_ratingCount = self.input2num(imdb_ratingCount)
        imdb_bestRating = self.input2num(imdb_bestRating)

        # convert description unicode into string
        description = description.encode('ascii', 'ignore')
        description = description.strip(
        ) if description and type(description) is str else ''

        # convert storyline unicode into string
        storyline = storyline.encode('ascii', 'ignore')
        storyline = storyline.strip(
        ) if storyline and type(storyline) is str else ''

        # Output
        item = ImdbScraperItem()
        item['movie_id'] = movie_id
        item['title'] = title
        item['film_rating'] = film_rating
        item['poster'] = poster
        item['trailer_img'] = trailer_img
        item['duration'] = duration
        item['genre'] = genre
        item['release_date'] = release_date
        item['imdb_ratingValue'] = imdb_ratingValue
        item['imdb_bestRating'] = imdb_bestRating
        item['imdb_ratingCount'] = imdb_ratingCount
        item['description'] = description
        item['release_date_unix_time'] = release_date_unix_time
        item['storyline'] = storyline
        item['director'] = credits.get('director', '')
        item['writer'] = credits.get('writer', '')
        item['creator'] = credits.get('creator', '')
        item['stars'] = credits.get('stars', '')
        item['taglines'] = taglines
        item['url'] = url
        item['req_headers'] = req_headers
        item['res_headers'] = res_headers
        yield item

    def input2num(self, iput):
        regnum = re.compile("^(?=.*?\d)\d*[.,]?\d*$")
        if iput:
            if iput.isdigit():
                return float(iput)
            oput = iput.replace(",", "")
            if regnum.match(oput):
                return float(oput)
        return -1

    def headers_format(self, header):
        hdr = {}
        for key, value in header.items():
            if isinstance(key, (bytes, bytearray)):
                hdr[key.decode('utf-8')] = b''.join(value).decode('utf-8')
            else:
                hdr[key] = ''.join(value)
        return json.dumps(hdr, ensure_ascii=False)

        ignoreDomains = [domain.strip() for domain in g.readlines()]
        g.close()
        if len(ignoreDomains) == 0:
            print "ignoreDomains.txt empty. No domains to be ignored initially."
        for domain in ignoreDomains:
            dcount.set_ignored_domain(domain)
    except IOError, e:
        print "No ignoreDomains.txt found. No domains to be ignored initially."

    # The spider follows this rule for each group of links encountered on a page
    rules = (
        Rule(LxmlLinkExtractor(
            allow=[r'.+\.(com|org|net|).*'],
            deny=[r'.+\.(jpg|png|pdf|mp4|mp3|zip|torrent|mov|gif|txt|csv|webm|epub)'],
            deny_domains=dcount.get_ignored_domains(),
            unique=True),
            callback='parse_item',
            process_links='process_links',
            follow=True),
    )

    def process_links(self, links):
        """
        Called for each list of links collected by the spider.
        Discards those links which have domains in ignoreDomains.

        :param links: A list of scraped Link objects collected by the spider
        :return: a list of Link objects from "good" domains.
        """

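# `dcount` is a shared domain-tracking helper used above but not defined in the
# snippet; a minimal sketch of the interface the rules rely on (name and body
# are assumptions):
class DomainCounter(object):
    """Tracks domains that the spider should ignore."""

    def __init__(self):
        self.ignored = set()

    def set_ignored_domain(self, domain):
        self.ignored.add(domain)

    def get_ignored_domains(self):
        return list(self.ignored)


dcount = DomainCounter()
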
class MopSpider(CrawlSpider):
    name = 'people_bbs'
    start_urls = ['http://bbs1.people.com.cn/']

    post_extract = LxmlLinkExtractor(
        allow=(
            '/post/',
        ),
        allow_domains=(
            'bbs1.people.com.cn'
        ),
        # deny=(
        # ),
        # deny_domains=(
        # )
    )
    author_extract = LxmlLinkExtractor(
        allow=(
            '/userInfo\.do\?',
        ),
        allow_domains=(
            'bbs1.people.com.cn',
        ),
        deny=(
            '/userInfo\.do\?action=thread',
            '/userInfo\.do\?action=follow',
            '/userInfo\.do\?action=jinghua',
            '/userInfo\.do\?orderBy=',
        ),
        # deny_domains=(
        # )
    )
    follow_extract = LxmlLinkExtractor(
        # allow=(
        #     '/s/[0-9]+',
        # ),
        allow_domains=(
            'bbs1.people.com.cn',
        ),
        deny=(
            '/userInfo\.do\?action=thread',
            '/userInfo\.do\?action=follow',
            '/userInfo\.do\?action=jinghua',
            '/userInfo\.do\?orderBy=',
        ),
        # deny_domains=(
        #     'q.blog.sina.com.cn'
        # )
    )

    rules = (
        Rule(author_extract, follow=True, callback='parse_author'),
        Rule(post_extract, follow=True, callback='parse_post'),
        # Rule(follow_extract, follow=True, callback='parse_follow'),
        Rule(follow_extract, follow=True, process_request=),
    )

    # a_count = 0
    # p_count = 0
    # f_count = 0

    def parse_author(self, response):
        # self.a_count += 1
        # print('author: ', self.a_count, ' ', response.url)
        author_item = get_author_item(response)
        if author_item:
            yield author_item

    def parse_post(self, response):
        # self.p_count += 1
        # print('post: ', self.p_count, ' ', response.url)
        post_item = get_post_item(response)
        content_href = post_item['content_href']
        if content_href:
            yield Request(
                url=content_href,
                callback=self.parse_content,
                meta={
                    'post_item': post_item
                }
            )
        else:
            pass

    def parse_content(self, response):
        post_item = response.meta['post_item']
        content, picture_hrefs = get_post_content(response)
        post_item['content'] = content
        post_item['picture_hrefs'] = picture_hrefs
        for comment_item in get_comment_list(response):
            post_item['comment_ids'].append(comment_item['comment_id'])
            yield comment_item
        yield post_item

class PharmnetCrawlSpider(CrawlSpider):
    """Pharmnet.com.cn, a Chinese pharmaceutical industry news site."""
    name = 'pharmnet'
    allowed_domains = ['pharmnet.com.cn']
    start_urls = [
        'http://news.pharmnet.com.cn/news/hyyw/news/index0.html',
        # 'http://news.pharmnet.com.cn/news/hyyw/news/index1.html',
    ]
    rules = (
        # LxmlLinkExtractor extracts the article and index-page links
        Rule(LxmlLinkExtractor(allow=(r'/news/\d{4}/\d{2}/\d{2}/\d+\.html',
                                      r'/news/hyyw/news/index\d+\.html'),
                               restrict_xpaths=('//div[@class="list"]',
                                                '//div[@class="page"]')),
             callback='parse_links',
             follow=False),
    )

    def parse_links(self, response):
        # If this is an article link (not a /hyyw/ index page), process it directly
        if '/hyyw/' not in response.url:
            yield self.parse_page(response)
        else:
            self.log('-------------------> link_list url=%s' % response.url,
                     logging.INFO)
            links = response.xpath('//div[@class="list"]/ul/li/p/a')
            for link in links:
                url = link.xpath('@href').extract()[0]
                yield Request(url=url, callback=self.parse_page)

    def parse_page(self, response):
        try:
            self.log('-------------------> link_page url=%s' % response.url,
                     logging.INFO)
            item = NewsItem()
            item['crawlkey'] = self.name
            item['category'] = ltos(
                response.xpath(
                    '//div[@class="current"]/a[last()]/text()').extract())
            item['link'] = response.url
            head_line = ltos(
                response.xpath('//div[@class="ct01"]/text()[1]').extract())
            item['location'] = head_line.strip().split()[1]
            item['pubdate'] = datetime.strptime(head_line.strip().split()[0],
                                                '%Y-%m-%d')
            item['title'] = ltos(response.xpath('//h1/text()').extract())
            content_temp = "".join([
                tt.strip() for tt in response.xpath(
                    '//div[@class="ct02"]/font/div/div|//div[@class="ct02"]/font/div'
                ).extract()
            ])
            item['content'] = filter_tags(content_temp)
            hc = ltos(response.xpath('//div[@class="ct02"]').extract())
            htmlcontent = clean_html(hc)
            # Lookaround constructs, not capturing groups:
            # (?=...)  the text after this point must match the expression
            # (?<=...) the text before this point must match the expression
            pat_img = re.compile(r'(<img (?:.|\n)*?src=")((.|\n)*?)(?=")')
            uuids = []
            for i, m in enumerate(pat_img.finditer(htmlcontent)):
                full_path = m.group(2)
                suffix_name = '.' + os.path.basename(full_path).split('.')[-1]
                uuid_name = '{0:02d}{1:s}'.format(
                    i + 1, uuid.uuid4().hex) + suffix_name
                uuids.append(uuid_name)
                self.log('UUID_PIC--------%s' % setting.URL_PREFIX + uuid_name,
                         logging.INFO)
                with contextlib.closing(urllib2.urlopen(full_path)) as f:
                    with open(os.path.join(IMAGES_STORE, uuid_name),
                              'wb') as bfile:
                        bfile.write(f.read())
            for indx, val in enumerate(uuids):
                htmlcontent = pat_img.sub(
                    Nth(indx + 1, setting.URL_PREFIX + val), htmlcontent)
            item['htmlcontent'] = htmlcontent
            self.log(
                '+++++++++title=%s+++++++++' % item['title'].encode('utf-8'),
                logging.INFO)
            return item
        except:
            self.log('ERROR-----%s' % response.url, logging.ERROR)
            raise DropItem('DropItem-----%s' % response.url)

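# `ltos` and `Nth` are project helpers that are not shown in this snippet. A
# minimal sketch of what the code above appears to assume (names kept, bodies
# are assumptions): `ltos` collapses an extracted list into one string, and
# `Nth(n, new_src)` builds a re.sub replacement callable that rewrites only the
# n-th <img src="..."> match and leaves the others untouched.
def ltos(extracted):
    """List-to-string: join the selector's extracted fragments."""
    return ''.join(extracted)


class Nth(object):
    """Replacement callable for re.sub that only rewrites the n-th match."""

    def __init__(self, n, new_src):
        self.n = n
        self.new_src = new_src
        self.count = 0

    def __call__(self, match):
        self.count += 1
        if self.count == self.n:
            # group(1) is the '<img ... src="' prefix captured by pat_img
            return match.group(1) + self.new_src
        return match.group(0)
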