def main_parse(self, response):
    le = LinkExtractor(restrict_css='div.all_member table.sy_table')
    print(len(le.extract_links(response)))
    for link in le.extract_links(response):
        examples = ExampleItem()
        examples['url_examples'] = [link.url]
        yield examples
def parse(self, response):
    le = LinkExtractor(restrict_css='article')
    # note: a LinkExtractor instance is always truthy; this check does not test
    # whether any links actually matched
    if le:
        for link in le.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_page)
    le = LinkExtractor(restrict_css='li.next')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    le = LinkExtractor(restrict_css='article.product_pod h3')
    for link in le.extract_links(response):
        yield scrapy.Request(link.url, callback=self.parse_book)
    le = LinkExtractor(restrict_css='ul.pager li.next')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    book_list = response.xpath("//ol[@class='row']//li")
    for books in book_list:
        book_message = BooksItem()
        book_message['book_title'] = books.xpath(
            './/h3/a/@title').extract_first()
        book_message['price'] = books.xpath(
            ".//div[@class='product_price']/p/text()").extract_first()
        book_message['book_link'] = 'http://books.toscrape.com/' + books.xpath(
            "./article/div/a/@href").extract_first()
        yield book_message
    '''
    # Extract the next-page link with a Selector
    next_url = response.xpath("//ul[@class='pager']/li[@class='next']/a/@href").extract()
    if next_url:
        next_page = response.urljoin(next_url[0])
        yield scrapy.Request(next_page, callback=self.parse)
    '''
    # Extract the next-page link with a LinkExtractor
    le = LinkExtractor(restrict_css='ul.pager li.next')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    pattern = r'/gsschool/.+\.shtml'
    link = LinkExtractor(allow=pattern)
    links = link.extract_links(response)
    print(type(links))
    for link in links:
        print(link)
def parse(self, response):
    body = Selector(text=response.body)
    images = body.css('img.image-section__image').extract()
    result = body.css(
        'img.image-section__image ::attr(src)').extract_first()
    # yield {'img_url': result}
    yield ImagecrawlerItem(file_urls=[result])
    if len(os.listdir('/work/imagecrawler/output/full')) > 0:
        for name in os.listdir('/work/imagecrawler/output/full'):
            # strip any query-string suffix from the saved filename
            new_name = name.split('?')[0]
            shutil.copy('/work/imagecrawler/output/full/' + name,
                        '/work/imagecrawler/output/result/' + new_name)
    # body.css().extract() returns a list which might be empty
    # for image in images:
    #     img_url = PexelsScraper.src_extractor.findall(image)[0]
    #     tags = [tag.replace(',', '').lower() for tag in PexelsScraper.tags_extractor.findall(image)[0].split(' ')]
    #     yield {'img_url': img_url}
    #     yield {'img_tags': tags}
    link_extractor = LinkExtractor(allow=PexelsScraper.url_matcher)
    next_links = [
        link.url for link in link_extractor.extract_links(response)
        if not self.is_extracted(link.url)
    ]
    # yield {'next_links': next_links}
    for link in next_links:
        yield scrapy.Request(link, self.parse)
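The file_urls field yielded above only results in a download when FilesPipeline is enabled. A minimal settings sketch, assuming FILES_STORE points at the /work/imagecrawler/output directory the snippet reads back from (FilesPipeline writes into its full/ subdirectory):

# settings.py -- a sketch; the project's real settings are not shown
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
}
FILES_STORE = '/work/imagecrawler/output'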
def parse(self, response):
    le = LinkExtractor(restrict_css='div.toctree-wrapper.compound',
                       deny='/index.html$')
    links = le.extract_links(response)
    print(len(links))
    print(links)
    for link in links:
        yield Request(link.url, callback=self.parse_example)
def parse(self, response): link = LinkExtractor(restrict_xpaths="//ul[@class='cont_xiaoqu']//li") links = link.extract_links(response) for link_line in links: print(link_line.url,link_line.text) item = LinkdemoItem() item["url"] = link_line.url item["text"] = link_line.text yield item
def parse(self, response):
    link = LinkExtractor(
        restrict_css='body > div.wrap > div.middleright > div > div.cartoon_online_border > ul > li'
    )
    links = link.extract_links(response)
    # link1 = link.extract_links(response)[0]
    for link in links:
        yield Request(url=link.url, callback=self.parse2, dont_filter=True)
def parse(self, response):
    link_regulation = LinkExtractor(restrict_css='section')
    url_list = link_regulation.extract_links(response)
    if url_list:
        for link in url_list:
            url = link.url
            if 'page-' in url:
                yield scrapy.Request(url, callback=self.parse)
            else:
                yield scrapy.Request(url, callback=self.parse_detail)
def parse(self, response):
    body = Selector(text=response.body)
    images = body.css('img').extract()
    for image in images:
        image = image.encode("utf-8")
        if PexelsScraper.src_extractor.findall(image):
            img_url = PexelsScraper.src_extractor.findall(image)[0]
            if img_url not in PexelsScraper.crawled_urls:
                if 'http' not in img_url:
                    print(img_url)
                    print(self.start_urls[0])
                    print(PexelsScraper.domain_extractor.findall(self.start_urls[0]))
                    img_url = PexelsScraper.domain_extractor.findall(
                        self.start_urls[0])[0][0] + img_url
                    print(img_url)
                PexelsScraper.crawled_urls.add(img_url)
                tags = ""
                img_name = ""
                img_type = ""
                if PexelsScraper.tags_extractor.findall(image):
                    tags = PexelsScraper.tags_extractor.findall(
                        image)[0].replace(',', '').lower()
                print(img_url, tags)
                if '/' in img_url and len(
                        PexelsScraper.filename_extractor.findall(img_url)) > 0:
                    img_name = PexelsScraper.filename_extractor.findall(img_url)[0][0]
                    img_type = PexelsScraper.filename_extractor.findall(img_url)[0][1]
                    print(img_name)
                data = requests.get(img_url).content
                im = Image.open(BytesIO(data))
                width, height = im.size
                # PexelsScraper.image_width = im.size[0]
                # PexelsScraper.image_height = im.size[1]
                img_aspect_ratio = self.calculate_aspect(width, height)
                yield ImagecrawlerItem(source_url=response.url,
                                       img_url=img_url,
                                       alternate_text=tags,
                                       img_width=width,
                                       img_height=height,
                                       img_name=img_name,
                                       img_type=img_type,
                                       img_aspect_ratio=img_aspect_ratio)
    link_extractor = LinkExtractor()
    next_links = [
        link.url for link in link_extractor.extract_links(response)
        if not self.is_extracted(link.url)
    ]
    # Crawl the filtered links
    for link in next_links:
        yield scrapy.Request(link, self.parse)
def parse(self, response):
    link_extractor = LinkExtractor(allow=RotaractSpider.url_matcher)
    links = [link.url for link in link_extractor.extract_links(response)]
    for link in links:
        flag = True
        article_links = []
        yield scrapy.Request(url=link,
                             callback=self.parse_articles,
                             meta={
                                 'article_links': article_links,
                                 'flag': flag
                             })
def parse(self, response):
    link = LinkExtractor(
        deny='/fang1/a2/',
        restrict_xpaths='//div[@class="f-filter f-w1190"]//dd[@class="info"]/div[@class="thr-list"]//li[@class="item"]/a'
    )
    links = link.extract_links(response)
    for i in links:
        city_name = re.split(r'/', i.url)[-3]
        yield Request(i.url,
                      callback=self.get_index,
                      meta={
                          'city_name': city_name,
                          'dont_redirect': True
                      },
                      dont_filter=True)
def callload(self, response):
    link = LinkExtractor(restrict_xpaths='//*[@cellspacing="1"]//a')
    link = link.extract_links(response)
    for urllist in link:
        url = urllist.url
        if url not in self.loaded:
            self.loaded.append(url)
            request = scrapy.Request(url,
                                     callback=self.parse,
                                     headers={'User-Agent': 'Mozilla/5.0'},
                                     dont_filter=True)
            path = self.path + '/' + urllist.text
            request.meta['item'] = path
            yield request
            time.sleep(2)
def get_index(self, response):
    city_name = response.meta['city_name']
    link = LinkExtractor(
        allow=r'/fang1/.*htm',
        restrict_xpaths='//div[@class="f-main f-clear f-w1190"]//div[@class="f-main-list"]/div[@class="f-list js-tips-list"]/div'
    )
    links = link.extract_links(response)
    for i in links:
        city = re.split(r'/|\.', i.url)[2]
        yield Request(i.url,
                      callback=self.get_message,
                      meta={
                          'city': city,
                          'city_name': city_name,
                          'dont_redirect': True
                      },
                      dont_filter=True)
def parse(self, response):
    USER = True
    next_links = []
    body = Selector(text=response.body)
    images = body.css('img.photo-item__img').extract()
    for image in images:
        img_url = PexelsScraper.src_extractor.findall(image)[0]
        tags = [
            tag.replace(',', '').lower()
            for tag in PexelsScraper.tags_extractor.findall(image)[0].split(' ')
        ]
        print("Tags_check: ")
        print(tags)
    link_extractor = LinkExtractor(allow=PexelsScraper.url_matcher)
    next_links = [
        link.url for link in link_extractor.extract_links(response)
        if not self.is_extracted(link.url)
    ]
    # Crawl the filtered links
    next_page_url = response.css(
        'div.pagination a[rel="next"]::attr(href)').extract_first()
    if next_page_url:
        next_page_url = URL + next_page_url
        next_links.append(next_page_url)
        print("next_page_url")
        print(next_page_url)
    if USER:
        links = response.css("a.pull-left::attr(href)").extract_first()
        print(links)
        if links:
            links = "https://www.pexels.com" + links
            for i in range(10):
                next_links.append(links + "?page=" + str(i))
            print("go into user parse")
            # request.meta['main_url'] = URL
            # yield request
            for each in next_links:
                yield scrapy.Request(each, self.parse_by_user)
            print("should have done user parse")
        print("Links_check: {}".format(links))
    for link in next_links:
        print("next_links")
        print(link)
        yield scrapy.Request(link, self.parse)
def parse(self, response):
    for info in response.css('.product_pod'):
        item = BooksItem()
        # print(info)
        item['name'] = info.css('h3>a::attr(title)').extract_first()
        # name = info.xpath('./h3/a/@title').extract_first()
        # print(name)
        item['price'] = info.css(
            '.product_price .price_color::text').extract_first()
        # price = info.xpath('//p[@class="price_color"]/text()').extract()
        # print(price)
        yield item
        bookstr = item['name'] + '\t' + item['price'] + '\n'
        self.f.write(bookstr)
    le = LinkExtractor(restrict_css='ul.pager li.next')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        yield scrapy.Request(next_url, callback=self.parse)
    # Alternative using a Selector instead of a LinkExtractor:
    # next_url = response.css('.pager .next>a::attr(href)').extract_first()
def parse(self, response):
    print(response)
    body = Selector(text=response.body)
    images = body.css('img.image-section__image').extract()
    print(images)
    # body.css().extract() returns a list which might be empty
    for image in images:
        img_url = Scraper.src_extractor.findall(image)[0]
        print(img_url)
    link_extractor = LinkExtractor(allow=Scraper.url_matcher)
    next_links = [
        link.url for link in link_extractor.extract_links(response)
        if not self.is_extracted(link.url)
    ]
    # Crawl the filtered links
    for link in next_links:
        yield scrapy.Request(link, self.parse)
def parse(self, response):
    link = LinkExtractor(restrict_xpaths=self.restrict_xpaths)
    links = link.extract_links(response)
    for i in links:
        self.item['link'] = i.url
        self.item['text'] = i.text
        self.item['date'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        self.item['status'] = 0
        _rule = self.rule
        print(self.rule)
        _rule['start_urls'] = [i.url]
        _rule['name'] = self.next_name
        _rule['step'] = self.next_step
        self.item['rule'] = _rule
        print(_rule)
        backend = RedisBackend(REDIS_CONF)
        backend.send('%s_%s' % (REDIS_KEY, self.next_name), str(self.item))
        # print self.item
        yield self.item
def parse(self, response):
    def get_links(obj):
        if obj:
            for link in obj:
                if link not in data:
                    data.append({
                        "referer": response.request.headers.get('Referer', None),
                        "url": link
                    })
        return

    if "text" not in str(response.headers["Content-Type"]):
        get_links([response.url])
    else:
        body = Selector(text=response.body)
        pdfs = body.css('a[href$=".pdf"]::attr(href)').extract()
        csvs = body.css('a[href$=".csv"]::attr(href)').extract()
        xl1 = body.css('a[href$=".xls"]::attr(href)').extract()
        xl2 = body.css('a[href$=".xlsx"]::attr(href)').extract()
        doc1 = body.css('a[href$=".doc"]::attr(href)').extract()
        doc2 = body.css('a[href$=".docx"]::attr(href)').extract()
        link_objs = [pdfs, csvs, xl1, xl2, doc1, doc2]
        link_extractor = LinkExtractor(allow=self.allowed_domains)
        next_links = [
            link.url for link in link_extractor.extract_links(response)
            if not self.is_extracted(link.url)
        ]
        # Crawl the filtered links
        for link in next_links:
            yield scrapy.Request(link, self.parse)
        [get_links(obj) for obj in link_objs]
def parse(self, response):
    link = LinkExtractor(restrict_xpaths=self.restrict_xpaths)
    links = link.extract_links(response)
    for i in links:
        self.item['link'] = i.url
        self.item['text'] = i.text
        self.item['date'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        self.item['status'] = 0
        _rule = self.rule
        _rule['start_urls'] = [i.url]
        _rule['name'] = self.next_name
        _rule['step'] = self.next_step
        self.item['rule'] = _rule
        print(self.rule)
        self.item['appname'] = response.xpath(self.rule['app_name']).extract()[0]
        downloadnumber = response.xpath(self.rule['downloadnumber']).extract()[0]
        self.item['downloadnumber'] = re.findall(r'\d+', downloadnumber)[0]
        downloadapp(i.url, _rule, self.item)
        # print self.item
        yield self.item
def parse(self, response):
    body = Selector(text=response.body)
    images = body.css('img.image-section__image').extract()
    # images is a list of every <img> tag carrying the 'image-section__image' class
    # body.css().extract() returns a list which might be empty
    for image in images:
        img_url = PexelsScraper.src_extractor.findall(image)[0]
        tags = [
            tag.replace(',', '').lower()
            for tag in PexelsScraper.tags_extractor.findall(image)[0].split(' ')
        ]
        img_type = str(img_url.split('/')[-1].split('.')[1].split('?')[0])
        # Build the full path of the image file
        img_fullname = Global.img_name + str(Global.img_count) + img_type
        img_fullpath = Global.img_path + img_fullname
        # Download the image (use a separate name so the spider's response is not
        # shadowed and deleted, which would break the LinkExtractor call below)
        img_response = requests.get(img_url)
        with open(img_fullpath + '.' + img_type, 'wb') as f:
            f.write(img_response.content)
        del img_response
        # Print the result to the console
        print(img_fullname, img_url, tags)
        Global.img_count = Global.img_count + 1
    link_extractor = LinkExtractor(allow=PexelsScraper.url_matcher)
    next_links = [
        link.url for link in link_extractor.extract_links(response)
        if not self.is_extracted(link.url)
    ]
    # Crawl the filtered links
    for link in next_links:
        yield scrapy.Request(link, self.parse)
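Fetching each image with requests inside the callback blocks Scrapy's event loop. One alternative is to let ImagesPipeline handle the downloads; the sketch below is illustrative only (the image_urls field and store path are assumptions, not taken from this project):

# settings.py -- a sketch
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = '/path/to/images'  # e.g. the directory Global.img_path points to

# and in parse(), yield the URL instead of downloading inline:
# yield {'image_urls': [img_url]}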
def parse(self, response):
    # try:
    self.crawled.append(response.url)
    path = response.meta['item']
    # except Exception as e:
    #     print(e)
    font = response.xpath('//font/text()[3]').extract()[0]
    title = response.xpath('/html/head/title/text()').extract()[0]
    title = re.findall(r'(.*?) -', title)[0]
    title = title.replace(':', '-')
    title = title.replace(':', '-')
    title = title.strip()
    title = title.strip('\t')
    if self.signal == 'y':
        path = path
        repath = path
    else:
        path = path + '/' + title
        repath = path
        self.signal = 'y'
        # repath = path
    localnumm = []
    localnumm.append(title)
    if font == '>文章内容':  # '>文章内容' is the page's "article content" marker
        path = repath + '/' + title
    localnumm.append(path)
    isExists = os.path.exists(path)
    if not isExists:
        os.makedirs(path)
    link = LinkExtractor(restrict_xpaths='//*[@cellspacing="1"]//a')
    link = link.extract_links(response)
    number = 0
    # for sel in link:
    #     if sel.url in self.loaded:
    #         pass
    #     else:
    #         self.loaded.append(sel.url)
    #         number = number + 1
    localnumm.append(response.url)
    localnumm.append(number)
    for sel in link:
        if sel.url not in self.loaded:
            self.loaded.append(sel.url)
            number = number + 1
            localnumm[3] = number
            self.i = self.i + 1
            # print(sel.url)
            titem = transitem()
            titem['repath'] = repath
            titem['path'] = path
            titem['list'] = localnumm
            request = scrapy.Request(sel.url,
                                     callback=self.download_parse,
                                     headers={'User-Agent': 'Mozilla/5.0'},
                                     dont_filter=True)
            request.meta['item'] = titem
            yield request
    self.numm.append(localnumm)
def parse(self, response): link = LinkExtractor(restrict_xpaths='//ul[@class="list"]/li') links = link.extract_links(response) print(links) print(link)
def parse_index(self, response):
    link = LinkExtractor(allow=r'2015/ssy/[a-z]{1,}(.htm)')
    links = link.extract_links(response)
    for link in links:
        yield scrapy.Request(link.url, callback=self.parse_category)
def parse(self, response):
    link = LinkExtractor(allow=r'http://www.nncc626.com/[a-z]{1,}(.htm)',
                         deny=r'index')
    links = link.extract_links(response)
    for link in links:
        yield scrapy.Request(link.url, callback=self.parse_index)
def parse(self, response):
    le = LinkExtractor(restrict_css='div.sphx-glr-thumbcontainer p.caption',
                       deny='/index.html$')
    for link in le.extract_links(response):
        yield scrapy.Request(link.url, callback=self.page_parser)
def parse(self, response):
    # def get_proxy():
    #     return requests.get("http://127.0.0.1:5010/get/").content
    #
    # def delete_proxy(proxy):
    #     requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))
    # Alternatively a random proxy IP could be used here.
    # No need to set it in this callback; the retry middleware is the right place.
    # Rotate IPs: with 500 usable IPs and 500 pages per minute, each host only
    # hits one page per minute from the server's point of view.
    # while response.status == 403 or response.status == 302:
    #     print(response.status)
    #     print(response.meta)
    #     # delete_proxy(response.headers)
    #     # delete the proxy, then get a new one
    #     proxy = get_proxy()
    #     print("Using new proxy: " + str(proxy))
    #     # If the proxy pool is exhausted, pause the spider for a while or switch
    #     # targets (mobile/WAP pages, or cached copies on major sites).
    #     response = scrapy.Request(url=response.url, meta={'proxy': 'http://' + str(proxy)})
    #     print(type(response))
    # print("got a response")
    item = LearningItem()

    # Scrape the book metadata.
    # Co-authors are grouped in one span, just like translators; a single author sits
    # in sibling <a> nodes after the span whose text is "作者" (author), so both layouts
    # must be handled.
    # Could an author have no link? No -- there is always at least a search link.
    # A single author may also be wrapped in nested spans.
    # Translator links also point to author pages; that does not matter when scraping
    # books, and the main database has a translators field if translation is studied later.
    def is_exist(item_argv, xpath1, **xpath2):
        # item[item_argv] = info.xpath(xpath1).extract().strip()
        try:
            item[item_argv] = info.xpath(xpath1).extract()
        except:
            print(str(item_argv) + " failed")
            item[item_argv] = ''
        if len(item[item_argv]) == 1:
            item[item_argv] = item[item_argv][0].strip()
        # if len(item[item_argv]) == 0 and item[item_argv] != '':
        #     item[item_argv] = ''
        # return item[item_argv][0].strip() if len(item[item_argv]) == 1 else item[item_argv]
        return item[item_argv]

    # try:
    #     # First identify the ways Douban can fail:
    #     # returns 403; returns 200 but requires login; returns an application-error page.
    #     print("trying to crawl")
    # except:
    #     print()
    #     print("banned!!!!!!!!!!!!!")
    # This only stops one of the coroutines; the others wind down gradually. Forcing
    # Ctrl+Z would add the remaining links to the dupefilter, so they would never be
    # crawled again.
    if response.status != 200:
        # Unsure whether pages missing the trailing '/' get redirected with a 301,
        # which would require changing the next_page code.
        # Checked in the shell: they do not; the redirect comes back as a 200 response
        # with the server appending the trailing '/'.
        raise CloseSpider('Forced stop!!!')
        # time.sleep(600)
        # raise CloseSpider()
        # return
        ## ADSL re-dial or IP-switching logic could go here.
        # print()
        # return
    print("Current URL: " + str(response.url))
    # writer_link_list = []
    # series_link_list = []
    try:
        info = response.xpath(u'//*[@id="info"]')[0]
    except:
        raise CloseSpider("Non-200 style error, current url: %s" % response.url)

    # Author and translator lists are handled together here:
    # check whether there are authors, check whether there are translators,
    # the text of author links above the translator label goes into the author list,
    # and if there is no translator all author-link text is treated as authors.
    # This can go wrong, e.g. if a "volunteer" role shows up -- just an example.
    # Author nodes: all <a> siblings preceding the next sibling <span> after the author
    # label; the author label comes first, so no other node interferes.
    # First determine which layout applies -- just write out all four variants.
    # Anchoring on another field (e.g. <a> tags above "出版社" are authors, below are
    # translators) breaks when that field is missing, so anchoring on the label itself
    # makes the spider more robust.
    # The Chinese literals below ("作者:" author, "译者:" translator, etc.) are field
    # labels on the Douban page and must stay as-is.
    # with colon, not nested
    w_name1 = info.xpath(
        u'//span[./text()="作者:"]/following-sibling::span[1]/preceding-sibling::a')
    # with colon, nested
    w_name2 = info.xpath(u'//span[./text()="作者:"]/parent::span/a')
    # no colon, not nested
    w_name3 = info.xpath(
        u'//span[./text()=" 作者"]/following-sibling::span[1]/preceding-sibling::a')
    # no colon, nested
    w_name4 = info.xpath(u'//span[./text()=" 作者"]/parent::span/a')
    if w_name1:
        item['writers'] = w_name1.xpath("./text()").extract()
        item['writers_link'] = w_name1.xpath("./@href").extract()
    elif w_name2:
        item['writers'] = w_name2.xpath("./text()").extract()
        item['writers_link'] = w_name2.xpath("./@href").extract()
    elif w_name3:
        item['writers'] = w_name3.xpath("./text()").extract()
        item['writers_link'] = w_name3.xpath("./@href").extract()
    elif w_name4:
        item['writers'] = w_name4.xpath("./text()").extract()
        item['writers_link'] = w_name4.xpath("./@href").extract()
    else:
        item['writers'] = ''
        item['writers_link'] = ''

    # ---------------------------------------------------------------- #
    # Translators
    # contains(@name,'na')
    # with colon, not nested
    t_name1 = info.xpath(
        u'//span[./text()="译者:"]/following-sibling::a[contains(@href,"search")]')
    # with colon, nested
    t_name2 = info.xpath(
        u'//span[./text()="译者:"]/following-sibling::a[contains(@href,"author")]')
    # no colon, not nested
    # Select hrefs containing a given substring. The links can be crawled directly,
    # but the Chinese fields still need later processing and extraction.
    # The combined variant below was broken (could not replace and join correctly):
    # t_name3 = info.xpath(u'//span[./text()=" 译者"]/following-sibling::a[contains(@href,"search") or contains(@href,"author")]')
    t_name3 = info.xpath(
        u'//span[./text()=" 译者"]/following-sibling::a[contains(@href,"search")]')
    # no colon, nested
    t_name4 = info.xpath(
        u'//span[./text()=" 译者"]/following-sibling::a[contains(@href,"author")]')
    if t_name4:
        item['translators'] = t_name4.xpath("./text()").extract()
        item['translators_link'] = t_name4.xpath("./@href").extract()
    elif t_name3:
        item['translators'] = t_name3.xpath("./text()").extract()
        item['translators_link'] = t_name3.xpath("./@href").extract()
    elif t_name2:
        item['translators'] = t_name2.xpath("./text()").extract()
        item['translators_link'] = t_name2.xpath("./@href").extract()
    elif t_name1:
        item['translators'] = t_name1.xpath("./text()").extract()
        item['translators_link'] = t_name1.xpath("./@href").extract()
    else:
        item['translators'] = ''
        item['translators_link'] = ''

    # ---------------------------------------------------------------- #
    item["publish"] = is_exist(
        "publish", u'//span[./text()="出版社:"]/following::text()[1]')
    item["publish_date"] = is_exist(
        "publish_date", u'//span[./text()="出版年:"]/following::text()[1]')
    item["pages"] = is_exist(
        "pages", u'//span[./text()="页数:"]/following::text()[1]')
    item["price"] = is_exist(
        "price", u'//span[./text()="定价:"]/following::text()[1]')
    item["binding"] = is_exist(
        "binding", u'//span[./text()="装帧:"]/following::text()[1]')
    item["ISBN"] = is_exist(
        "ISBN", u'//span[./text()="ISBN:"]/following::text()[1]')
    item["orgin_name"] = is_exist(
        "orgin_name", u'//span[./text()="原作名:"]/following::text()[1]')
    item["series"] = is_exist(
        "series", u'//span[./text()="丛书:"]/following::a[1]/text()')
    item["series_link"] = is_exist(
        "series_link", u'//span[./text()="丛书:"]/following-sibling::a[1]/@href')
    # item["summary"] = is_exist("summary",)
    # item["w_summary"] = is_exist("w_summary",)
    item["catalog"] = is_exist("catalog", '//*[contains(@id,"dir_")]/text()')
    item["tag"] = is_exist("tag", '//*[@id="db-tags-section"]/div/span/a/text()')
    item["series_info"] = is_exist(
        "series_info",
        '//*[@id="content"]/div/div[1]/div[3]/div[@class="subject_show block5"]/div//text()')
    # item["readers"] = is_exist("readers",).extract().strip()
    # item["title"] = is_exist("title",).extract().strip()
    # item["url"] = is_exist("url",).extract().strip()
    # item["score"] = is_exist("score",).extract().strip()

    try:
        item['title'] = response.xpath(
            "//*[@id='wrapper']/h1/span/text()").extract_first()
    except:
        item['title'] = ''
    item['url'] = response.url.replace("https://book.douban.com/subject/", "").strip('/')
    try:
        item['score'] = response.css(
            '#interest_sectl > div > div.rating_self.clearfix > strong::text'
        ).extract_first().strip()
        if item['score'] == '':
            item['score'] = '0'
    except:
        item['score'] = '0'

    # Older per-field extraction, superseded by the is_exist() calls above:
    # try:
    #     item['publish'] = info.xpath().extract_first().strip()
    # except:
    #     item['publish'] = ''
    # try:
    #     item['publish_date'] = info.xpath(u'//span[./text()="出版年:"]/following::text()[1]').extract_first().strip()
    # except:
    #     item['publish_date'] = ''
    # try:
    #     item['pages'] = info.xpath(u'//span[./text()="页数:"]/following::text()[1]').extract_first().strip()
    # except:
    #     item['pages'] = ''
    # try:
    #     item['price'] = info.xpath(u'//span[./text()="定价:"]/following::text()[1]').extract_first().strip()
    # except:
    #     item['price'] = ''
    # try:
    #     item['binding'] = info.xpath(u'//span[./text()="装帧:"]/following::text()[1]').extract_first().strip()
    # except:
    #     item['binding'] = ''
    # try:
    #     item['ISBN'] = info.xpath(u'//span[./text()="ISBN:"]/following::text()[1]').extract_first().strip()
    # except:
    #     item['ISBN'] = ''
    # try:
    #     item['orgin_name'] = info.xpath(u'//span[./text()="原作名:"]/following::text()[1]').extract_first().strip()
    # except:
    #     item['orgin_name'] = ''
    # try:
    #     item['series'] = info.xpath(u'//span[./text()="丛书:"]/following::a[1]/text()').extract_first().strip()
    # except:
    #     item['series'] = ''
    # try:
    #     item['series_link'] = info.xpath(u'//span[./text()="丛书:"]/following-sibling::a[1]/@href').extract_first().strip()
    # except:
    #     item['series_link'] = ''

    # Two cases here: the summary may or may not be collapsed behind a "more" link;
    # try the collapsed version first, then fall back to the other.
    try:
        summary = response.xpath(
            '//*[@id="link-report"]/span/div/div[@class="intro"]/p/text()')
        if summary:
            item['summary'] = summary.extract()
        else:
            item['summary'] = response.xpath(
                '//*[@id="link-report"]/div[1]/div/p/text()').extract()
        # if len(item['summary']) == 0 and item['summary'] != '':
        #     item['summary'] = ''
    except:
        item['summary'] = ''
    try:
        w_summary = response.css(
            '#content > div > div.article > div.related_info > div:nth-child(4) > span.all.hidden > div > p::text')
        if w_summary:
            item['w_summary'] = w_summary.extract()
        else:
            item['w_summary'] = response.css(
                '#content > div > div.article > div.related_info > div:nth-child(4) > span.short > div > p::text'
            ).extract()
        # if len(item['w_summary']) == 0 and item['w_summary'] != '':
        #     item['w_summary'] = ''
    except:
        item['w_summary'] = ''
    # try:
    #     # broken
    #     # item['catalog'] = response.xpath('//*[contains(@id,"full") and contains(@id,"dir")]/text()').extract()
    #     item['catalog'] = response.xpath('//*[contains(@id,"dir_")]/text()').extract()
    # except:
    #     item['catalog'] = ''
    # try:
    #     item['tag'] = response.xpath('//*[@id="db-tags-section"]/div/span/a/text()').extract()
    # except:
    #     item['tag'] = ''
    # try:
    #     # series info is sampled randomly
    #     item['series_info'] = response.xpath('//*[@id="content"]/div/div[1]/div[3]/div[@class="subject_show block5"]/div//text()').extract()
    # except:
    #     item['series_info'] = ''
    try:
        item['readers'] = response.css(
            '#interest_sectl > div > div.rating_self.clearfix > div > div.rating_sum > span > a > span::text'
        ).extract_first()
        if item['readers'] is None:
            item['readers'] = '0'
    except:
        item['readers'] = '0'

    # '//*[@id="link-report"]/div[1]/div/p'/div/div[@class="intro"]/p/text()
    # Earlier author/translator extraction experiments:
    # if w_name_mode1:
    #     # w_name = w_name_mode1.xpath('./following-sibling::span[1]/preceding-sibling::a/text()').extract_first().replace("\n","").replace(" ","")
    #     w_name = w_name_mode1.xpath('./following-sibling::span[1]/preceding-sibling::a/text()')
    #     # If the author name can be captured, store it; otherwise it is the nested-span layout.
    #     if w_name:
    #         item['writer'] = w_name.extract()
    #     else:
    #         item['writer'] = w_name_mode1.xpath('./following-sibling::span[1]/preceding-sibling::a/text()')
    # writer_name_type2 = links.xpath('//span[./text()=" 作者"]/following-sibling::span[1]/preceding-sibling::a/text()').extract_first().replace("\n","").replace(" ","")
    # writer_name_type3 =
    # Single-author nodes are done; groups of author nodes still need handling
    # (see a university-textbook entry for an example). A group of author nodes works
    # like a group of translator nodes; translator nodes: the next span node after the
    # translator label. Groups of translators are solved; for a single translator see
    # 傅雷 (Fu Lei) as an example.
    # link_extract = item.extract()
    # if "author" in link:
    #     # print(item.xpath('./@href').extract())
    #     # this could be shortened
    #     writer_link_list.append(link)
    # Store the full URL so later crawls skip the URL-joining step; faster, and the
    # disk cost is small.
    # if "search" in link:
    #     link = "https://book.douban.com/" + link
    #     writer_link_list.append(link)
    # if "series" in link:
    #     series_link_list.append(link)
    # item['writer_link'] = writer_link_list
    # item['series_link'] = series_link_list
    #
    # item['writer'] = response.xpath(u'//span[./text()="作者:"]/following::a[2]')
    # // *[ @ id = "info"] / a[1]
    # item['publish'] = response.xpath(u'//span[./text()="出版社:"]/following::text()[1]')
    # item['orgin_name'] = response.xpath(u'//span[./text()="原作名:"]/following::text()[1]')
    # This only covers one case; another needs its own try...except, plus the problems
    # of Chinese books with no translator and of full-width vs half-width punctuation.
    # c = ""  # single translator
    # try:
    #     if a:
    #         item['translator'] = a[0].xpath('./a/text()').extract()
    #     if b:
    #         item['translator'] = b[0].xpath('./a/text()').extract()
    # except:
    #     item['translator'] = ''

    # Weighted-score ingredients (kept for reference):
    # number of valid raters
    # if item['readers']:
    #     v = int(item['readers'])
    # else:
    #     v = 0
    # minimum number of raters to qualify for the top 250
    # m = 10000
    # book score
    # if item['score']:
    #     R = float(item['score'])
    # else:
    #     R = 0
    # C is the average score over all books, stored in the database; a rough value is fine.
    # C = 7
    item["weighting"] = 0
    item['seen'] = 0
    yield item
    # item['p_date']
    # item['total_pages']
    # item['price']
    # item['binding']
    # item['series']
    # item['ISBN']
    # item['summary']
    # item['w_introduce']
    # item['ca']
    # item['tag']
    # item['s_info']
    # item['score']
    # item['readers']
    # print(item['title'])
    # all = response.xpath("string(//*[@id='info'])")
    # all =
    # print(all.extract())
    # print(all.extract()[0].replace("\n",""))
    # print(all.extract()[0].replace("\n","").replace(" ",""))
    # print(type(all.extract()))
    # yield item
    # ids are usually stable, so CSS changes can be ignored.
    # Skip cleaning for now in exchange for crawl speed.
    # all = response.xpath('//*[@id="info"]')
    # all = all.extract()[0].replace("\n","").replace("\t","").split("<br>")
    # for item in all:
    #     print(item.replace('<spanclass="pl">',"").replace("</span>","").replace("""<divid="info"class="">""","").replace("</div>","").replace("</a>","").replace("""<aclass=""href=""","").replace("<span>","").replace("<ahref=",""))
    # all = response.xpath(u'//span[./text()=" 作者"]/following::text()')
    # print(all)
    # Write to MySQL in batches, not on every item.

    # Extract the links under "readers who liked this book also liked".
    link = LinkExtractor(
        restrict_xpaths=('//*[@id="db-rec-section"]/div//dl//dd'))
    links = link.extract_links(response)
    # If the links are directly related, response.follow could be used instead; it
    # returns a request for the url, which can then be yielded:
    # links = response.xpath('//*[@id="db-rec-section"]/div//dl//dd').extract()
    # for link in links:
    #     yield response.follow(link, callback=self.parse)
    for link in links:
        # print("popping one url")
        # if link.url.endswith('/'):
        #     pass
        # else:
        #     link.url = link.url + "/"
        # Without a trailing '/', the URL gets redirected -- unnecessary, but possibly
        # a signal sites use to detect crawlers.
        yield scrapy.Request(url=link.url, callback=self.parse)
def parse(self, response):
    link = LinkExtractor(restrict_css='div.toctree-wrapper.compound',
                         deny='index.html$')
    for link in link.extract_links(response):
        yield scrapy.Request(url=link.url, callback=self.file_parse)
def mvlink(self, response):
    links_rule = LinkExtractor(allow=r'/movie/\d+')
    links = links_rule.extract_links(response)
    for i in links:
        sleep(1)
        yield Request(i.url, callback=self.neirong)